From c0e63f213290298ac6fb50091153193626a5a1a7 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Wed, 22 Apr 2026 16:56:25 +0000 Subject: [PATCH 01/29] Add feed operations spec for Cosmos driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defines the design for feed operations (queries, read-many, change feed) in the driver crate, covering the Plan → Execute model, OperationPayload/OperationTarget refactors, continuation tokens, OpenTelemetry integration, and partition split handling. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../docs/FEED_OPERATIONS_SPEC.md | 1333 +++++++++++++++++ 1 file changed, 1333 insertions(+) create mode 100644 sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md new file mode 100644 index 00000000000..fe4ed144f19 --- /dev/null +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -0,0 +1,1333 @@ +# Feed Operations Spec for `azure_data_cosmos_driver` + +**Status:** Draft / Iterating +**Date:** 2026-04-21 +**Authors:** (team) +**Crate:** `azure_data_cosmos_driver` + +--- + +## Table of Contents + +1. [Goals & Motivation](#1-goals--motivation) +2. [Architectural Overview](#2-architectural-overview) +3. [CosmosOperation Changes](#3-cosmosoperation-changes) +4. [Operation Plans](#4-operation-plans) +5. [Planner](#5-planner) +6. [Plan Executor](#6-plan-executor) +7. [Continuation Tokens](#7-continuation-tokens) +8. [OpenTelemetry Integration](#8-opentelemetry-integration) +9. [Error Handling & Partition Splits](#9-error-handling--partition-splits) +10. [API Semantics & Invariants](#10-api-semantics--invariants) +11. [Configuration Surface](#11-configuration-surface) +12. [Performance & Non-Regression](#12-performance--non-regression) +13. [Migration Plan](#13-migration-plan) +14. [Testing Strategy](#14-testing-strategy) +15. [Future Work](#15-future-work) + +--- + +## 1. Goals & Motivation + +### Problem Statement + +The driver currently supports only **point operations** — operations that target a single resource +and produce a single response. Operations like `ReadItem`, `UpsertItem`, and `DeleteContainer` go +through `execute_operation`, which drives the operation pipeline (region failover, session tokens, +transport retry) and returns a single `CosmosResponse`. + +**Feed operations** — queries, read-many, read-all-items, and change feed — are fundamentally +different. They produce multiple pages of results, may fan out across partition key ranges, may +require backend-provided query plans, and need pagination state that can be serialized across +request boundaries. + +Today, feed operations are handled entirely in the higher-level `azure_data_cosmos` crate, bypassing +the driver's operation pipeline. This means feed operations miss out on the driver's multi-region +failover, partition-level circuit breaker, throughput control, and diagnostics infrastructure. + +### Goals + +1. **Unified execution model** — Both point and feed operations flow through a common + Plan → Execute pipeline. Point operations produce a trivial single-step plan. Feed operations + produce multi-step plans that leverage the existing point-operation pipeline for individual + HTTP requests. + +2. 
**Resumable pagination** — Feed operations produce a typed continuation token that can be
   serialized to a string and carried across process boundaries (e.g., sent to a browser).
   Resuming with a valid continuation token and an equivalent operation descriptor continues
   where the previous execution left off.

3. **Extensible operation model** — The plan model must support ReadMany (the initial target),
   cross-partition queries, single-partition queries/reads, and change feed, even if some are
   implemented later.

4. **Driver-level concerns** — Feed operations must integrate with multi-region failover,
   partition-level failover (PPAF/PPCB), throughput control, session consistency, and
   diagnostics — all managed by the driver.

5. **Schema-agnostic pages** — The driver returns response pages as raw bytes (`Vec<u8>`).
   The higher-level SDK handles deserialization, consistent with the existing `CosmosResponse`
   model. Future work (sort, aggregate) will require the driver to understand feed envelopes,
   but the initial design reserves space for this without requiring it.

6. **Performance non-regression** — Point operations must not pay measurable overhead for the
   unified plan model. Trivial plans must be allocation-light.

### Non-Goals (This Spec)

- Full cross-partition query execution with ORDER BY merge-sort and aggregation (future work).
- Change feed full design (future work; this spec reserves extension points).
- Client-side query rewriting or optimization.

### Primary Target

**ReadMany** is the first feed operation to implement. It exercises:
- Partition key range resolution (via `PartitionKeyRangeCache`)
- Fan-out across multiple partition key ranges
- Merging results into a single response
- Integration with the operation pipeline for each sub-request

---

## 2. Architectural Overview

```text
┌──────────────────────────────────────────────────────────────────────────────────┐
│                                   CosmosDriver                                    │
│                                                                                    │
│  execute_operation(op, opts) → CosmosResponse          [point operations]         │
│  execute_feed_operation(op, opts) → FeedPager          [feed operations]          │
│                                                                                    │
│  Both internally:                                                                  │
│    1. Planner creates an OperationPlan                                             │
│    2. PlanExecutor runs the plan                                                   │
│    3. Point ops: executor drains single page, returns CosmosResponse               │
│    4. Feed ops: executor is wrapped in FeedPager for caller iteration              │
│                                                                                    │
│  ┌──────────────────────────────────────────────────────────┐                     │
│  │                         PLANNER                          │                     │
│  │                                                          │                     │
│  │  Input:  CosmosOperation + OperationOptions              │                     │
│  │  Output: OperationPlan                                   │                     │
│  │                                                          │                     │
│  │  Responsibilities:                                       │                     │
│  │  ┌─ Determine targeting (single PK, EPK range,           │                     │
│  │  │    all ranges)                                        │                     │
│  │  ├─ For ReadMany: group items by PK range,               │                     │
│  │  │    create fan-out steps                               │                     │
│  │  ├─ For cross-partition query: fetch backend             │                     │
│  │  │    query plan, create steps                           │                     │
│  │  ├─ For single-partition ops: create single-step plan    │                     │
│  │  └─ For point ops: create trivial single-step plan       │                     │
│  └──────────────────────────────────────────────────────────┘                     │
│                               │                                                    │
│                               ▼                                                    │
│  ┌──────────────────────────────────────────────────────────┐                     │
│  │                      PLAN EXECUTOR                       │                     │
│  │                                                          │                     │
│  │  Input:  OperationPlan                                   │                     │
│  │  Output: Stream of FeedResponsePage                      │                     │
│  │          (or single page for point ops)                  │                     │
│  │                                                          │                     │
│  │  Responsibilities:                                       │                     │
│  │  ┌─ Execute plan steps with configurable concurrency     │                     │
│  │  ├─ Each step calls execute_operation_pipeline()         │                     │
│  │  │    for HTTP                                           │                     │
│  │  ├─ Manage continuation state across turns               │                     │
│  │  ├─ Handle partition splits (re-plan affected ranges)    │                     │
│  │  ├─ Enforce concurrency caps for fan-out                 │                     │
│  │  ├─ Integrate with throughput control                    │                     │
│  │  ├─ Emit OpenTelemetry spans per turn                    │                     │
│  │  └─ Produce continuation tokens for serialization        │                     │
│  └──────────────────────────────────────────────────────────┘                     │
│                               │                                                    │
│                               ▼                                                    │
│  ┌──────────────────────────────────────────────────────────┐                     │
│  │               OPERATION PIPELINE (existing)              │                     │
│  │                                                          │                     │
│  │  execute_operation_pipeline() — unchanged                │                     │
│  │  Handles: region failover, session tokens,               │                     │
│  │           transport retry, auth, 429 backoff,            │                     │
│  │           diagnostics                                    │                     │
│  └──────────────────────────────────────────────────────────┘                     │
└──────────────────────────────────────────────────────────────────────────────────┘
```

### Layer Separation

| Concern | Component | Location |
|---------|-----------|----------|
| Operation intent & payload | `CosmosOperation` | `models/cosmos_operation.rs` |
| Plan creation | `Planner` | `driver/feed/planner.rs` (new) |
| Plan model | `OperationPlan`, `PlanStep` | `driver/feed/plan.rs` (new) |
| Plan execution | `PlanExecutor` | `driver/feed/executor.rs` (new) |
| Public pager | `FeedPager` | `driver/feed/pager.rs` (new) |
| Continuation state | `ContinuationToken` | `models/continuation_token.rs` (new) |
| Per-step HTTP execution | `execute_operation_pipeline` | `driver/pipeline/` (existing) |

---

## 3. CosmosOperation Changes

### 3.1 OperationType Refactor

`OperationType` currently carries no data and is `Copy`. Feed operations require variant-specific
data (query text, item lists, etc.). Rather than bloating `OperationType` with payload data — which
would break `Copy` and mix operation semantics with operation payload — we split the concern:

- **`OperationType`** remains a lightweight, `Copy` enum describing operation semantics
  (HTTP method, read-only, idempotent). Unchanged from today.

- **`OperationPayload`** is a new enum carrying variant-specific data. It replaces the untyped
  `body: Option<Vec<u8>>` field on `CosmosOperation`.

```rust
/// Operation-specific payload data.
///
/// Replaces the generic `body: Option<Vec<u8>>` on `CosmosOperation`.
/// Each variant carries exactly the data needed for its operation type.
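///
/// For example, a single-partition query is carried as
/// `OperationPayload::Query { query, parameters }`, while a document write
/// carries pre-serialized JSON as `OperationPayload::Body(bytes)` (see the
/// variants below).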
#[derive(Clone, Debug)]
pub enum OperationPayload {
    /// No payload needed (e.g., ReadItem, DeleteItem, ReadContainer).
    None,

    /// Raw body bytes (e.g., CreateItem, UpsertItem, ReplaceItem).
    /// The caller provides pre-serialized JSON.
    Body(Vec<u8>),

    /// SQL query text with optional parameters (e.g., QueryItems).
    Query {
        /// The SQL query text.
        query: String,
        /// Pre-serialized parameters JSON array, if any.
        parameters: Option<Vec<u8>>,
    },

    /// ReadMany item descriptors: (item_id, partition_key) pairs.
    ReadMany {
        /// The items to read, as (id, partition_key) pairs.
        items: Vec<(String, PartitionKey)>,
    },

    // Future: ChangeFeed { mode, start_from, ... }
}
```

`CosmosOperation` changes from:

```rust
pub struct CosmosOperation {
    operation_type: OperationType,
    resource_type: ResourceType,
    resource_reference: CosmosResourceReference,
    partition_key: Option<PartitionKey>,
    request_headers: CosmosRequestHeaders,
    body: Option<Vec<u8>>, // ← removed
}
```

to:

```rust
pub struct CosmosOperation {
    operation_type: OperationType,
    resource_type: ResourceType,
    resource_reference: CosmosResourceReference,
    target: OperationTarget,
    request_headers: CosmosRequestHeaders,
    payload: OperationPayload,
}
```

### 3.2 OperationTarget

Partition targeting is currently a single `Option<PartitionKey>` field. Feed operations introduce
additional targeting modes. These are mutually exclusive, so they become an enum:

```rust
/// How the operation is targeted to partitions.
///
/// Determines which partition key range(s) the operation executes against.
/// Only one targeting mode is active per operation.
#[derive(Clone, Debug)]
pub enum OperationTarget {
    /// No partition targeting (account-level or database-level operations).
    None,

    /// Target a specific logical partition key.
    /// Used for: single-partition reads, writes, queries.
    PartitionKey(PartitionKey),

    /// Target a specific effective partition key range.
    /// Used for: scoped feed operations on a sub-range.
    EpkRange {
        min_inclusive: EffectivePartitionKey,
        max_exclusive: EffectivePartitionKey,
    },

    /// Target a specific partition key range by its server-assigned ID.
    /// Used for: resuming from a continuation that recorded the range ID.
    PkRangeId(String),

    /// Target all partition key ranges.
    /// Semantically equivalent to `EpkRange { min: "00", max: "FF" }`.
    /// Used for: cross-partition queries, read-all-items, read-many.
    AllRanges,
}
```

### 3.3 Factory Method Updates

Existing factory methods are updated to use `OperationPayload` and `OperationTarget`:

```rust
impl CosmosOperation {
    /// Reads an item.
    pub fn read_item(item: ItemReference) -> Self {
        let partition_key = item.partition_key().clone();
        Self::new(OperationType::Read, item)
            .with_target(OperationTarget::PartitionKey(partition_key))
            // No payload needed — item ID is in the resource reference.
    }

    /// Creates an item. Use `with_body()` to provide the document JSON.
    pub fn create_item(container: ContainerReference, partition_key: PartitionKey) -> Self {
        let resource_ref = CosmosResourceReference::from(container)
            .with_resource_type(ResourceType::Document)
            .into_feed_reference();
        Self::new(OperationType::Create, resource_ref)
            .with_target(OperationTarget::PartitionKey(partition_key))
            // Caller attaches body via .with_payload(OperationPayload::Body(...))
    }

    /// Queries items within a single partition.
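    ///
    /// The resulting operation is paginated: each execution turn yields one
    /// server page (see §4.2), and a continuation token is produced while
    /// more pages remain (see §7).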
    pub fn query_items(
        container: ContainerReference,
        partition_key: PartitionKey,
        query: impl Into<String>,
    ) -> Self {
        let resource_ref = CosmosResourceReference::from(container)
            .with_resource_type(ResourceType::Document)
            .into_feed_reference();
        Self::new(OperationType::Query, resource_ref)
            .with_target(OperationTarget::PartitionKey(partition_key))
            .with_payload(OperationPayload::Query {
                query: query.into(),
                parameters: None,
            })
    }

    /// Queries items across all partitions.
    pub fn query_items_cross_partition(
        container: ContainerReference,
        query: impl Into<String>,
    ) -> Self {
        let resource_ref = CosmosResourceReference::from(container)
            .with_resource_type(ResourceType::Document)
            .into_feed_reference();
        Self::new(OperationType::Query, resource_ref)
            .with_target(OperationTarget::AllRanges)
            .with_payload(OperationPayload::Query {
                query: query.into(),
                parameters: None,
            })
    }

    /// Reads multiple items by their ID/partition-key pairs.
    pub fn read_many(
        container: ContainerReference,
        items: Vec<(String, PartitionKey)>,
    ) -> Self {
        let resource_ref = CosmosResourceReference::from(container)
            .with_resource_type(ResourceType::Document)
            .into_feed_reference();
        Self::new(OperationType::Query, resource_ref)
            .with_target(OperationTarget::AllRanges)
            .with_payload(OperationPayload::ReadMany { items })
    }
}
```

### 3.4 Backward Compatibility

The `body: Option<Vec<u8>>` field is removed and replaced with `payload: OperationPayload`.
Factory methods that previously required `.with_body(...)` now accept the body in the factory
method or via `.with_payload(...)`. A convenience method `with_body(Vec<u8>)` can be kept as
sugar for `with_payload(OperationPayload::Body(...))`.

The transport pipeline's request builder must be updated to extract body bytes from
`OperationPayload` when constructing the HTTP request. For `Body` and `Query` variants, this
is straightforward serialization. For `ReadMany`, the Planner decomposes the operation before
it reaches the transport pipeline, so the transport never sees a `ReadMany` payload directly.

---

## 4. Operation Plans

### 4.1 Plan Model

An `OperationPlan` is a directed acyclic graph (DAG) of `PlanStep` nodes. Each step represents
a unit of work that produces a partial result. Steps may depend on other steps (for merge/sort),
and may produce continuations for subsequent turns.

```rust
/// A plan for executing an operation.
///
/// Plans range from trivial (single step for a point read) to complex
/// (fan-out across partition key ranges with merge). The plan is created
/// by the Planner and executed by the PlanExecutor.
pub(crate) struct OperationPlan {
    /// The steps in this plan, indexed by StepId.
    steps: Vec<PlanStep>,

    /// Which step produces the final output.
    /// For single-step plans, this is step 0.
    /// For fan-out plans, this is typically a merge step.
    output_step: StepId,

    /// Whether this plan supports pagination (multiple turns).
    /// ReadMany plans do not paginate; query plans do.
    paginates: bool,
}

/// A unique identifier for a step within an OperationPlan.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub(crate) struct StepId(usize);

/// A single step in an operation plan.
pub(crate) enum PlanStep {
    /// Execute a single HTTP request via the operation pipeline.
    ///
    /// This is the leaf step that actually talks to the Cosmos DB service.
    /// It carries a `CosmosOperation` configured for a specific PK range.
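    ///
    /// A point operation compiles to a plan containing exactly one `Fetch`
    /// step (see §4.2).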
    Fetch {
        /// The operation to execute. Targeted to a specific PK range.
        operation: CosmosOperation,
        /// Options for this fetch.
        options: OperationOptions,
        /// Server-provided continuation token for this range, if resuming.
        continuation: Option<String>,
    },

    /// Merge results from multiple upstream steps with no ordering guarantee.
    ///
    /// Results are concatenated in the order they complete. Used by ReadMany
    /// to combine results from multiple PK ranges. For ReadMany, all upstream
    /// fetches are driven to completion and their results are concatenated.
    ///
    /// Note: Cross-partition queries without an explicit ORDER BY still return
    /// results in (PartitionKey, ID) ascending order within each partition.
    /// The `UnorderedMerge` step concatenates partition results but does NOT
    /// sort across partitions. For globally-ordered results, use `OrderedMerge`
    /// (future).
    UnorderedMerge {
        /// The steps whose results are merged.
        inputs: Vec<StepId>,
    },

    // Future variants:
    // OrderedMerge { inputs: Vec<StepId>, order_by: ... },
    // Aggregate { inputs: Vec<StepId>, aggregation: ... },
}
```

### 4.2 Plan Examples

#### Point Operation (ReadItem)

```text
Step 0: Fetch(read_item operation) → output
```

A trivial plan with one step. The executor runs it, gets a `CosmosResponse`, done.

#### ReadMany

```text
Step 0: Fetch(query to PK range "0", items [(pk_a, a), (pk_b, b)])
Step 1: Fetch(query to PK range "1", items [(pk_c, c)])
Step 2: Fetch(query to PK range "2", items [(pk_d, d), (pk_e, e), (pk_f, f)])
Step 3: UnorderedMerge(inputs: [0, 1, 2]) → output
```

The executor runs steps 0–2 concurrently (subject to concurrency cap), each driving all
server-side pages to completion. Step 3 merges the fully-buffered results.

**Optimization:** When a PK range contains only a single item, the Planner MAY optimize
the `Fetch` step to a point read (`OperationType::Read` with `OperationTarget::PartitionKey`)
instead of a query, avoiding the overhead of query parsing on the backend.

#### Single-Partition Query

```text
Step 0: Fetch(query to PK "my-pk", continuation: None) → output
```

On subsequent turns, the executor updates the continuation in step 0 and re-executes.
Each turn yields one page.

#### Cross-Partition Query (Future)

```text
Step 0: Fetch(query to PK range "0")
Step 1: Fetch(query to PK range "1")
Step 2: Fetch(query to PK range "2")
Step 3: UnorderedMerge(inputs: [0, 1, 2]) → output
        (or OrderedMerge for explicit ORDER BY)
```

Each turn, the executor advances whichever PK range steps have results available.

**Ordering note:** Within each partition, results are always returned in
(PartitionKey, ID) ascending order — even without an explicit `ORDER BY` clause.
The `UnorderedMerge` step concatenates partition results without cross-partition
sorting. For queries with an explicit `ORDER BY`, an `OrderedMerge` step (future)
performs a k-way merge over partition heads to produce globally ordered results.

### 4.3 Incremental Page Production

Plans MUST support incremental page production. The executor does NOT wait for all partition
steps to complete before emitting a page. Instead:

- **Unordered fan-out** (ReadMany, cross-partition query without ORDER BY): Results are
  buffered per partition step. For ReadMany, all partitions are driven to completion and
  merged (single logical page). For queries, pages are emitted as partitions produce them.
  Note that within each partition, results arrive in (PartitionKey, ID) ascending order;
  only the cross-partition merge is unordered.

- **Ordered fan-out** (cross-partition query with explicit ORDER BY, future): A k-way merge
  streams items from partition heads. A page is emitted when enough items are available or
  a partition produces a page boundary.

- **Single-step plans**: Each turn is one HTTP request, one page.

### 4.4 Trivial Plan Optimization

For point operations, the plan model MUST be zero or near-zero overhead compared to the current
direct `execute_operation_pipeline` call. Implementation strategies:

- **Inline the trivial case**: When the plan is a single `Fetch` step with no dependencies,
  the executor can skip graph traversal and directly call `execute_operation_pipeline`.
- **Stack allocation**: Trivial plans can use a fixed-size array or inline representation
  rather than heap-allocated `Vec`.

---

## 5. Planner

### 5.1 Responsibilities

The Planner transforms a `CosmosOperation` into an `OperationPlan`. It is a synchronous,
deterministic function for most operations, but MAY need to perform async I/O for cross-partition
queries (fetching a backend query plan).

```rust
pub(crate) struct Planner<'a> {
    /// Access to the PK range cache for partition resolution.
    pk_range_cache: &'a PartitionKeyRangeCache,
}

impl<'a> Planner<'a> {
    /// Creates an operation plan from a CosmosOperation.
    ///
    /// For point operations, this is synchronous and trivial.
    /// For feed operations, this may need to resolve PK ranges
    /// and (for cross-partition queries) fetch a backend query plan.
    pub async fn plan(
        &self,
        operation: &CosmosOperation,
        options: &OperationOptions,
        continuation: Option<&ContinuationToken>,
        // Callback for fetching PK ranges (keeps Planner transport-decoupled).
        fetch_pk_ranges: impl Fn(...) -> ...,
        // Callback for fetching query plans (keeps Planner transport-decoupled).
        fetch_query_plan: impl Fn(...) -> ...,
    ) -> azure_core::Result<OperationPlan> {
        // ...
    }
}
```

### 5.2 Planning Logic by Operation Type

| Operation | Targeting | Plan Strategy |
|-----------|-----------|---------------|
| ReadItem, DeleteItem, etc. | `PartitionKey` | Single `Fetch` step. Trivial. |
| CreateDatabase, ReadContainer, etc. | `None` | Single `Fetch` step. Trivial. |
| QueryItems (single partition) | `PartitionKey` | Single `Fetch` step. Paginated. |
| ReadAllItems (single partition) | `PartitionKey` | Single `Fetch` step. Paginated. |
| QueryItems (cross-partition) | `AllRanges` | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. May fetch query plan. |
| ReadMany | `AllRanges` | Group items by PK range → N `Fetch` steps + `UnorderedMerge`. No pagination. |
| ReadAllItems (cross-partition) | `AllRanges` | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. Paginated. |
| ChangeFeed (future) | varies | TBD |

### 5.3 Operation Decomposition: From One `CosmosOperation` to Many

A key responsibility of the Planner is decomposing a single caller-provided `CosmosOperation`
into multiple targeted `CosmosOperation` instances — one per partition key range — that each
flow through `execute_operation_pipeline` independently. This section illustrates the full
decomposition for two representative operations.
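Before the worked examples, a minimal sketch of the grouping step at the heart of this
decomposition (the `epk_for` and `range_id_for_epk` helpers here are illustrative
assumptions, not the crate's actual API):

```rust
use std::collections::HashMap;

/// Sketch: bucket ReadMany items by the PK range that owns each item's
/// effective partition key (EPK). Helper names are hypothetical.
fn group_items_by_range(
    items: Vec<(String, PartitionKey)>,
    cache: &PartitionKeyRangeCache,
) -> HashMap<String, Vec<(String, PartitionKey)>> {
    let mut groups: HashMap<String, Vec<(String, PartitionKey)>> = HashMap::new();
    for (id, pk) in items {
        // Hash the logical partition key to its EPK, then resolve the
        // owning PK range via the cache.
        let epk = epk_for(&pk);
        let range_id = range_id_for_epk(cache, &epk);
        groups.entry(range_id).or_default().push((id, pk));
    }
    groups
}
```

Ranges that end up with exactly one item are the candidates for the point-read
optimization shown in the ReadMany example below.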
+ +#### Example: Cross-Partition Query + +The caller creates a single operation: + +```rust +let op = CosmosOperation::query_items_cross_partition( + container.clone(), + "SELECT * FROM c WHERE c.status = 'active'", +); +// op.target == OperationTarget::AllRanges +// op.payload == OperationPayload::Query { query: "SELECT ...", parameters: None } +``` + +The Planner resolves the container's partition key ranges (say, ranges "0", "1", "2") and +produces a plan with **three separate `CosmosOperation`** instances: + +```text +Caller's CosmosOperation + target: AllRanges + payload: Query { "SELECT * FROM c WHERE c.status = 'active'" } + │ + ▼ + ┌─── Planner ───┐ + │ Resolve PK │ + │ ranges: 0,1,2 │ + └───────┬────────┘ + ┌───────────┼───────────┐ + ▼ ▼ ▼ + CosmosOperation CosmosOperation CosmosOperation + type: Query type: Query type: Query + target: target: target: + PkRangeId("0") PkRangeId("1") PkRangeId("2") + payload: payload: payload: + Query{same SQL} Query{same SQL} Query{same SQL} + │ │ │ + ▼ ▼ ▼ + execute_operation execute_operation execute_operation + _pipeline() _pipeline() _pipeline() + │ │ │ + ▼ ▼ ▼ + CosmosResponse CosmosResponse CosmosResponse + │ │ │ + └───────────────┼───────────────┘ + ▼ + UnorderedMerge + │ + ▼ + FeedResponsePage +``` + +Each decomposed `CosmosOperation` carries the same query payload but is **retargeted** to a +specific PK range ID. The operation pipeline handles region failover, retry, and auth for each +independently. + +#### Example: ReadMany + +The caller creates one operation with 5 items across 3 PK ranges: + +```rust +let op = CosmosOperation::read_many(container.clone(), vec![ + ("id_a".into(), PartitionKey::from("pk_a")), + ("id_b".into(), PartitionKey::from("pk_b")), + ("id_c".into(), PartitionKey::from("pk_c")), + ("id_d".into(), PartitionKey::from("pk_d")), + ("id_e".into(), PartitionKey::from("pk_e")), +]); +``` + +The Planner computes EPKs for each partition key, groups by PK range, and produces: + +```text +Caller's CosmosOperation + target: AllRanges + payload: ReadMany { items: [(id_a,pk_a), (id_b,pk_b), (id_c,pk_c), (id_d,pk_d), (id_e,pk_e)] } + │ + ▼ + ┌─── Planner ──────────────────────────────────────────┐ + │ EPK(pk_a),EPK(pk_b) → PK range "0" │ + │ EPK(pk_c) → PK range "1" (single item!) │ + │ EPK(pk_d),EPK(pk_e) → PK range "2" │ + └───────┬──────────────────────────────────────────────┘ + ┌───────────┼───────────┐ + ▼ ▼ ▼ + CosmosOperation CosmosOperation CosmosOperation + type: Query type: Read type: Query + target: target: target: + PkRangeId("0") PK(pk_c) PkRangeId("2") + payload: payload: payload: + Body{query on None (point Body{query on + (pk_a,id_a), read of id_c) (pk_d,id_d), + (pk_b,id_b)} (pk_e,id_e)} +``` + +Note two things: +1. The ReadMany query for each PK range filters on **both partition key and ID**, because + ID alone is not unique — only (PartitionKey, ID) is unique within a container. +2. PK range "1" contains only a single item, so the Planner **optimizes it to a point read** + (`OperationType::Read` with `OperationTarget::PartitionKey`), avoiding query overhead. + +Each decomposed operation then flows through `execute_operation_pipeline` independently. + +### 5.4 Query Plan Fetching + +For cross-partition queries, the Planner may need a backend query plan to determine: +- Which partitions to target +- Whether the query requires client-side sort/aggregate +- Optimized partition routing + +The Planner uses a **callback** to fetch the query plan, keeping it transport-decoupled. 
The
callback internally calls `execute_operation_pipeline` (not `execute_operation`), avoiding
re-entry into the Planner. The `OperationType::QueryPlan` variant already exists for this.

```rust
// The Planner calls this callback, which the driver wires to
// execute_operation_pipeline directly (bypassing the Planner).
async fn fetch_query_plan(
    operation: &CosmosOperation,
    options: &OperationOptions,
) -> azure_core::Result<BackendQueryPlan> {
    let query_plan_op = CosmosOperation::query_plan(
        operation.container().unwrap().clone(),
        /* query text from operation payload */
    );
    let response = execute_operation_pipeline(query_plan_op, options, ...).await?;
    BackendQueryPlan::from_response(response)
}
```

This avoids the recursion concern: `fetch_query_plan` calls `execute_operation_pipeline`
directly, which is the internal pipeline function, not the public `execute_operation` that
goes through the Planner.

### 5.5 Resuming from a Continuation Token

When a `ContinuationToken` is provided, the Planner uses it to reconstruct the plan state:

1. Validate the token version and operation compatibility.
2. Restore per-range continuation state.
3. Skip ranges that are already completed.
4. If a PK range ID in the token no longer exists (partition split), re-resolve using the
   EPK range bounds stored in the token and map to the new PK range(s).

---

## 6. Plan Executor

### 6.1 Core Execution Loop

The Plan Executor runs an `OperationPlan` and produces pages of results.

```rust
pub(crate) struct PlanExecutor {
    plan: OperationPlan,
    /// Per-step state (continuation, completion status).
    step_states: Vec<StepState>,
    /// Concurrency control for fan-out.
    concurrency_limit: usize,
    /// OpenTelemetry context for span linking.
    trace_context: FeedTraceContext,
}

impl PlanExecutor {
    /// Executes the next turn, producing one page of results.
    ///
    /// Returns `None` when the plan is complete (no more pages).
    ///
    /// For non-paginating plans (ReadMany), the first call drives all
    /// steps to completion and returns the merged result. Subsequent
    /// calls return `None`.
    pub async fn next_turn(
        &mut self,
        driver_context: &DriverContext,
    ) -> azure_core::Result<Option<FeedResponsePage>> {
        // ...
    }

    /// Serializes the current execution state into a continuation token.
    ///
    /// Returns `None` if the plan is complete or does not support pagination.
    pub fn continuation_token(&self) -> Option<ContinuationToken> {
        // ...
    }
}
```

### 6.2 Turn Execution

Each call to `next_turn`:

1. **Emit OpenTelemetry span** for this turn (child of the feed operation span, linked to root).
2. **Identify runnable steps** — steps whose dependencies are satisfied.
3. **Execute runnable steps concurrently** (up to concurrency cap), each via
   `execute_operation_pipeline`.
4. **Collect results** from completed steps.
5. **Advance continuation state** for steps that returned server continuations.
6. **Execute dependent steps** (e.g., `UnorderedMerge`) when their inputs are ready.
7. **Produce the page** from the output step's result.
8. **Update step states** for the next turn.

### 6.3 Concurrency Control

Fan-out steps are executed with a configurable concurrency cap:

```rust
/// Maximum number of concurrent partition key range fetches.
///
/// Defaults to `min(num_pk_ranges, 10)`. Configurable via
/// `FeedOperationOptions::max_concurrency`.
concurrency_limit: usize,
```

The executor uses a semaphore or similar mechanism to limit concurrent
`execute_operation_pipeline` calls.
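A minimal sketch of one way to apply that cap, using `buffer_unordered` from the `futures`
crate (the `FetchStep` type and the `execute_operation_pipeline` signature here are
illustrative, not the crate's actual shapes):

```rust
use futures::stream::{self, StreamExt};

/// Sketch: drive all fan-out fetches with at most `limit` requests in
/// flight at once. Dropping the returned future cancels in-flight work,
/// matching the cancellation semantics in §6.5.
async fn run_fetches(
    steps: Vec<FetchStep>,
    limit: usize,
) -> azure_core::Result<Vec<CosmosResponse>> {
    stream::iter(steps)
        .map(|step| execute_operation_pipeline(step.operation, step.options))
        .buffer_unordered(limit)
        // The first error fails the whole fan-out, consistent with the
        // partial-failure semantics in §9.3.
        .collect::<Vec<azure_core::Result<CosmosResponse>>>()
        .await
        .into_iter()
        .collect()
}
```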
Each concurrent call independently goes through the
full operation pipeline (region failover, retry, etc.).

### 6.4 ReadMany Execution Details

ReadMany is the initial target. Its execution:

1. **Planner** groups `(id, partition_key)` pairs by PK range (via `PartitionKeyRangeCache`).
2. **Plan** has N `Fetch` steps (one per PK range) + one `UnorderedMerge` step.
3. **Executor** runs all `Fetch` steps concurrently (up to concurrency limit).
4. Each `Fetch` step sends a query to its PK range. The query body encodes **both the item IDs
   and the partition keys** for that range, because ID alone is not unique — only the
   (PartitionKey, ID) pair is unique within a container. If the response includes a server
   continuation, the executor continues fetching that range until all items are retrieved.
5. **UnorderedMerge** step concatenates results from all ranges.
6. Returns a single `FeedResponsePage` containing all items.
7. Subsequent `next_turn` calls return `None` (ReadMany does not paginate).

**Optimization:** When a PK range contains only a single item, the Planner optimizes the
`Fetch` step to a point read instead of a query (see §4.2).

**Semantics:**
- **Missing items**: Items not found are silently omitted from the result. The response does
  not indicate which items were not found.
- **Order**: Output order is NOT guaranteed to match input order. Items are grouped by
  partition key range.
- **Partial failure**: If any PK range fetch fails after exhausting retries, the entire
  ReadMany operation fails. Partial results are not returned.

### 6.5 Backpressure & Cancellation

- **Caller drops the `FeedPager`**: In-flight `execute_operation_pipeline` futures are
  cancelled via standard Rust drop semantics. The executor does not buffer results beyond
  what is needed for the current turn.
- **Memory bounds**: The executor does not buffer more than `concurrency_limit` concurrent
  page results. For ReadMany (which buffers all results), the total buffered data is bounded
  by the total size of all items — the caller controls this by the size of the input list.
- **Cancellation mid-turn**: If the caller cancels (drops the future) during a turn, any
  in-flight HTTP requests are dropped. The continuation token from the *previous* completed
  turn remains valid for resumption.

---

## 7. Continuation Tokens

### 7.1 Token Structure

```rust
/// A typed continuation token for resuming a feed operation.
///
/// Opaque to callers. Serializes to a string via `Display` and
/// deserializes via `FromStr`. The internal representation is
/// versioned and validated on deserialization.
#[derive(Clone, Debug)]
pub struct ContinuationToken {
    inner: ContinuationTokenInner,
}

/// Internal token representation (not public).
#[derive(Clone, Debug, Serialize, Deserialize)]
struct ContinuationTokenInner {
    /// Token format version for forward/backward compatibility.
    version: u32,

    /// Container identity (RID, not name) to detect container recreation.
    container_rid: String,

    /// The operation kind this token is valid for.
    operation_kind: ContinuationOperationKind,

    /// Per-partition-range state.
    ranges: Vec<RangeContinuation>,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
struct RangeContinuation {
    /// The EPK bounds of this range (stable across splits).
    min_inclusive_epk: String,
    max_exclusive_epk: String,

    /// The PK range ID at the time the continuation was created.
    /// Used as a hint for fast resolution; falls back to EPK bounds
    /// if the range has split.
    pk_range_id: String,

    /// Server-provided continuation token for this range.
    /// `None` means this range is completed.
    server_continuation: Option<String>,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
enum ContinuationOperationKind {
    Query,
    ReadFeed,
    // Future: ChangeFeed, etc.
}
```

### 7.2 Serialization

`ContinuationToken` implements `Display` and `FromStr`. The wire format is base64-encoded JSON:

```rust
impl Display for ContinuationToken {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let json = serde_json::to_vec(&self.inner).map_err(|_| fmt::Error)?;
        let encoded = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(&json);
        f.write_str(&encoded)
    }
}

impl FromStr for ContinuationToken {
    type Err = azure_core::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let decoded = base64::engine::general_purpose::URL_SAFE_NO_PAD
            .decode(s)
            .map_err(|e| azure_core::Error::new(ErrorKind::DataConversion, e))?;
        let inner: ContinuationTokenInner = serde_json::from_slice(&decoded)
            .map_err(|e| azure_core::Error::new(ErrorKind::DataConversion, e))?;
        // Version check
        if inner.version > CURRENT_TOKEN_VERSION {
            return Err(azure_core::Error::with_message(
                ErrorKind::DataConversion,
                "continuation token version is newer than this SDK supports",
            ));
        }
        Ok(Self { inner })
    }
}
```

### 7.3 Compatibility Contract

A continuation token is **invalidated** by:

1. **Container recreation** — The token's `container_rid` won't match the new container's RID.
2. **Token version mismatch** — A token produced by a newer SDK version may not be readable
   by an older version.
3. **Operation kind mismatch** — Resuming a `Query` continuation for a `ReadFeed` operation
   is rejected.

A continuation token **survives**:

1. **Partition splits** — The token stores EPK bounds, not just PK range IDs. On resume, the
   Planner re-resolves EPK bounds to current PK range IDs.
2. **SDK version upgrades** — The token is versioned. Older token versions are supported by
   newer SDKs (backward compatible deserialization).
3. **Process boundaries** — The token is a self-contained string, safe to send to a browser
   and back.

### 7.4 What the Token Does NOT Encode

- **Query text or parameters** — The caller must provide an equivalent `CosmosOperation`.
- **Session tokens** — Session consistency is not preserved across process boundaries via
  the continuation token. The driver resolves session tokens from the `SessionManager` cache
  for each turn independently.
- **Container name or database name** — Only the RID is stored. The caller provides routing
  context via the `CosmosOperation`.

---

## 8. OpenTelemetry Integration

### 8.1 Span Hierarchy

Feed operations produce the following span structure:

```text
Feed Operation Span (root)
  ├── db.cosmosdb.operation = "query_items" (or "read_many", etc.)
  ├── db.cosmosdb.container = "my-container"
  ├── db.cosmosdb.feed_operation_id = <uuid>
  │
  ├── Turn 0 Span
  │     ├── db.cosmosdb.feed_turn_index = 0
  │     ├── [linked to Feed Operation Span]
  │     │
  │     ├── PK Range "0" Fetch Span
  │     │     └── (transport pipeline spans)
  │     ├── PK Range "1" Fetch Span
  │     │     └── (transport pipeline spans)
  │     └── UnorderedMerge Span
  │
  ├── Turn 1 Span (if paginated)
  │     ├── db.cosmosdb.feed_turn_index = 1
  │     └── ...
  ...
+``` + +### 8.2 Cross-Process Span Linking + +When a feed operation is resumed from a continuation token in a different process: + +1. The original Feed Operation Span is NOT re-opened (it may have ended). +2. A new Feed Operation Span is created in the new process. +3. The continuation token carries the `feed_operation_id` (a UUID). +4. Each Turn Span in the new process includes a **span link** to the original + feed operation ID, enabling distributed tracing tools to connect the turns + across process boundaries. + +### 8.3 Point Operation Spans + +Point operations continue to produce a single span as they do today. The plan/executor layer +does not add additional span nesting for trivial single-step plans. + +--- + +## 9. Error Handling & Partition Splits + +### 9.1 Partition Split During Execution + +When a `Fetch` step receives a 410/1002 (Gone — PartitionKeyRangeGone) response: + +1. **Invalidate** the `PartitionKeyRangeCache` for the affected container. +2. **Re-fetch** the partition key ranges. +3. **Re-plan** the affected step: the original PK range has split into two or more new + ranges. The executor replaces the single `Fetch` step with new `Fetch` steps for each + new range. +4. **Update the `UnorderedMerge` step** (if any) to include the new steps. +5. **Resume execution** with the new steps. + +The continuation token must survive this: since tokens store EPK bounds (not just PK range +IDs), the re-plan can correctly map EPK bounds to the new PK range IDs. + +### 9.2 Error Propagation + +| Error Scenario | Behavior | +|----------------|----------| +| 410/1002 (PartitionKeyRangeGone) | Re-plan affected range(s), retry. | +| 429 (Throttled) | Handled by transport pipeline (backoff + retry). | +| 503 (Service Unavailable) | Handled by operation pipeline (region failover). | +| 404 (Not Found) — container | Fail the entire feed operation. | +| 404 (Not Found) — item in ReadMany | Item omitted from results (not an error). | +| Transient network error | Handled by transport pipeline (retry). | +| Invalid continuation token | Fail with `ErrorKind::DataConversion`. | + +### 9.3 Partial Failure in Fan-Out + +For ReadMany and cross-partition queries, if one PK range fails after exhausting all retries +(transport + operation pipeline), the entire feed operation fails. Partial results from +successful ranges are NOT returned. + +**Rationale:** Returning partial results would require the caller to distinguish between +"all items fetched" and "some items fetched, some failed" — a complex API that most callers +don't want. If partial results are needed in the future, they can be exposed via a separate +API or option. + +--- + +## 10. API Semantics & Invariants + +### 10.1 Public API + +```rust +impl CosmosDriver { + /// Executes a point operation (read, write, delete). + /// + /// Internally, this creates a trivial single-step plan and executes it. + /// The overhead is negligible compared to the HTTP round-trip. + pub async fn execute_operation( + &self, + operation: CosmosOperation, + options: OperationOptions, + ) -> azure_core::Result { + // Plan → Execute → drain single page → return CosmosResponse + } + + /// Executes a feed operation (query, read-many, read-all). + /// + /// Returns a `FeedPager` that yields pages of results. The caller + /// pulls pages by calling `next_page()`. Each page includes a + /// continuation token for resumption. 
    pub async fn execute_feed_operation(
        &self,
        operation: CosmosOperation,
        options: FeedOperationOptions,
    ) -> azure_core::Result<FeedPager> {
        // Plan → wrap executor in FeedPager → return
    }
}
```

### 10.2 FeedPager

`FeedPager` is the public-facing page iterator. It wraps the internal `PlanExecutor` and
provides a stable API that does not expose plan/executor internals.

```rust
/// An iterator over pages of feed operation results.
///
/// Created by [`CosmosDriver::execute_feed_operation`]. Yields pages
/// of results until the operation is complete.
///
/// Dropping the `FeedPager` cancels any in-flight requests.
pub struct FeedPager {
    executor: PlanExecutor,
}

impl FeedPager {
    /// Fetches the next page of results.
    ///
    /// Returns `Ok(None)` when no more pages are available.
    pub async fn next_page(&mut self) -> azure_core::Result<Option<FeedResponsePage>> {
        self.executor.next_turn(/* ... */).await
    }

    /// Returns the continuation token for the current position.
    ///
    /// The token can be serialized to a string and used to resume the
    /// operation later by passing it to `FeedOperationOptions::with_continuation`.
    ///
    /// Returns `None` if the operation is complete or does not support
    /// pagination (e.g., ReadMany).
    pub fn continuation_token(&self) -> Option<ContinuationToken> {
        self.executor.continuation_token()
    }
}
```

### 10.3 FeedResponsePage

```rust
/// A single page of results from a feed operation.
///
/// Contains raw response bytes and metadata. The higher-level SDK
/// handles deserialization into typed items.
pub struct FeedResponsePage {
    /// Raw response body (the items array as JSON bytes).
    body: Vec<u8>,

    /// Cosmos-specific response headers (RU charge, session token, etc.).
    headers: CosmosResponseHeaders,

    /// Diagnostics for this page (may aggregate multiple sub-request diagnostics).
    diagnostics: Arc<CosmosDiagnostics>,
}
```

### 10.4 Ordering Guarantees

| Operation | Order Guarantee |
|-----------|-----------------|
| ReadMany | Unordered across partitions. Within each partition, (PartitionKey, ID) ascending. |
| Single-partition query | Server-determined order: (PartitionKey, ID) ascending, or as specified by ORDER BY. |
| Cross-partition query (no ORDER BY) | Within each partition, (PartitionKey, ID) ascending. Across partitions, unordered (partition results are concatenated by `UnorderedMerge`). |
| Cross-partition query (ORDER BY) | Globally ordered per ORDER BY clause (future work: `OrderedMerge` k-way merge). |
| ReadFeed (single partition) | (PartitionKey, ID) ascending. |
| ReadFeed (cross-partition) | Within each partition, (PartitionKey, ID) ascending. Across partitions, unordered. |

### 10.5 Page Boundaries

Page boundaries are determined by:
- **Server-side max item count**: The server may return fewer items than requested.
- **Client-side max item count**: Configurable via `FeedOperationOptions::max_item_count`.
- **Server continuation**: A page boundary occurs whenever the server returns a continuation
  token.

For ReadMany, there is exactly one logical page (the merged result), regardless of how many
server-side pages were consumed internally.

---

## 11. Configuration Surface

### 11.1 FeedOperationOptions

```rust
/// Options specific to feed operations.
///
/// Extends `OperationOptions` with feed-specific settings.
pub struct FeedOperationOptions {
    /// Base operation options (retry, timeout, consistency, etc.).
    base: OperationOptions,

    /// Maximum number of items per page.
    /// If not set, the server default applies.
    max_item_count: Option<u32>,

    /// Maximum number of concurrent partition key range fetches.
    /// Default: min(num_pk_ranges, 10).
    max_concurrency: Option<usize>,

    /// Continuation token for resuming a previous operation.
    continuation: Option<ContinuationToken>,
}
```

### 11.2 Layered Options Resolution

Feed operation options follow the same layered resolution as existing operation options:

1. `FeedOperationOptions` (per-call)
2. `DriverOptions` (per-driver)
3. `CosmosDriverRuntime` (global)
4. Environment variables

The `max_concurrency` and `max_item_count` fields follow the same precedence.

---

## 12. Performance & Non-Regression

### 12.1 Point Operation Overhead

The plan model MUST NOT regress point operation performance. Requirements:

- **No heap allocation** for trivial plans beyond what `execute_operation` does today.
- **No additional async machinery** (no spawning, no channels) for single-step plans.
- **Benchmark**: Point operation latency with the plan model must be within 1% of the
  current direct `execute_operation_pipeline` call.

Implementation: The `execute_operation` method detects trivial operations (based on
`OperationType` and `OperationTarget`) and calls `execute_operation_pipeline` directly,
bypassing the Planner/Executor entirely. The plan model is only instantiated for feed
operations.

### 12.2 Fan-Out Memory Bounds

For ReadMany:
- Buffered data is bounded by the total size of all items in the response.
- The executor does not buffer more than `max_concurrency` in-flight requests.

For paginated queries:
- Each turn buffers at most one page per in-flight partition fetch.
- Total buffer: `max_concurrency × max_page_size`.

---

## 13. Migration Plan

### Phase 1: OperationType / OperationPayload Refactor

1. Add `OperationPayload` enum.
2. Add `OperationTarget` enum.
3. Update `CosmosOperation` to use `OperationPayload` and `OperationTarget`.
4. Update factory methods.
5. Update transport pipeline request builder to extract body from `OperationPayload`.
6. Remove `body: Option<Vec<u8>>` from `CosmosOperation`.
7. Update all callers (driver internals, tests, `azure_data_cosmos` bridge).

**This is a breaking internal change.** The `body` field and `partition_key` field on
`CosmosOperation` are replaced. All internal callers must be updated.

### Phase 2: Plan Infrastructure

1. Implement `OperationPlan`, `PlanStep`, `StepId`.
2. Implement `Planner` with trivial single-step planning (point ops only).
3. Implement `PlanExecutor` for single-step plans.
4. Wire `execute_operation` through Plan → Execute path (with fast-path bypass).
5. Validate no performance regression via benchmarks.

### Phase 3: ReadMany

1. Implement ReadMany planning in `Planner`:
   - Group items by PK range (via `PartitionKeyRangeCache`).
   - Create fan-out `Fetch` steps + `UnorderedMerge` step.
2. Implement `UnorderedMerge` step execution in `PlanExecutor`.
3. Implement `FeedPager` and `FeedResponsePage`.
4. Add `execute_feed_operation` to `CosmosDriver`.
5. Integration tests with partition splits.

### Phase 4: Single-Partition Queries

1. Implement single-partition query planning.
2. Implement paginated execution (continuation threading).
3. Implement `ContinuationToken` serialization.

### Phase 5: Cross-Partition Queries

1. Implement query plan fetching in `Planner`.
2. Implement multi-range query planning.
3. Implement incremental page production for unordered queries.
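Throughout these phases, the §12.1 fast path keeps point operations off the plan machinery.
A minimal sketch of the Phase 2 "fast-path bypass" (the `is_point` predicate and the
planner/executor helper names are illustrative, not the crate's actual API):

```rust
impl CosmosDriver {
    /// Sketch: point operations skip the Planner/Executor entirely; only
    /// feed operations pay for planning (§12.1).
    pub async fn execute_operation(
        &self,
        operation: CosmosOperation,
        options: OperationOptions,
    ) -> azure_core::Result<CosmosResponse> {
        if operation.is_point() {
            // Trivial plan: one Fetch step, no fan-out; call the
            // pipeline directly with no plan allocation.
            return self.execute_operation_pipeline(operation, options).await;
        }
        // Feed path: Plan → Execute (Phases 3–5).
        let plan = self.plan_operation(&operation, &options).await?;
        self.execute_plan(plan).await
    }
}
```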
+ +### Phase 6: Advanced Query Features (Future) + +1. ORDER BY merge-sort. +2. Aggregation. +3. Change feed. + +--- + +## 14. Testing Strategy + +### 14.1 Unit Tests + +| Test Area | Cases | +|-----------|-------| +| Planner — point ops | Verify trivial single-step plan for each point operation type. | +| Planner — ReadMany | Verify correct grouping by PK range. Items spread across ranges. | +| Planner — single-partition query | Verify single `Fetch` step with correct targeting. | +| PlanExecutor — single step | Execute trivial plan, verify result matches direct pipeline call. | +| PlanExecutor — fan-out | Execute multi-step plan with mock pipeline, verify merge. | +| PlanExecutor — concurrency | Verify concurrency cap is respected (at most N concurrent fetches). | +| ContinuationToken — round-trip | Serialize to string, deserialize back, verify equality. | +| ContinuationToken — version compat | Older version tokens deserialize correctly. | +| ContinuationToken — split recovery | Token with stale PK range ID maps to new ranges via EPK bounds. | +| OperationTarget — mutual exclusivity | Verify builder rejects invalid combinations. | + +### 14.2 Integration Tests + +| Test Area | Cases | +|-----------|-------| +| ReadMany — basic | Read 10 items across 3 partitions, verify all returned. | +| ReadMany — missing items | Read items where some don't exist, verify present items returned. | +| ReadMany — single partition | All items in one partition, verify no unnecessary fan-out. | +| ReadMany — partition split | Trigger split during ReadMany, verify re-plan and completion. | +| ReadMany — large set | Read 1000 items, verify server-side pagination within each range works. | +| Query — single partition | Execute paginated query, verify continuation threading. | +| Query — resume | Execute query, get continuation, resume in new FeedPager, verify continues. | +| Diagnostics | Verify RU charges are aggregated across fan-out steps. | +| Throughput control | Verify fan-out respects throughput control group limits. | + +### 14.3 Performance Tests + +| Test Area | Metric | +|-----------|--------| +| Point op overhead | Latency regression < 1% vs. direct `execute_operation_pipeline`. | +| ReadMany fan-out | Latency scales sub-linearly with partition count (concurrency works). | +| Memory bounds | Peak memory for ReadMany of N items is O(N × item_size). | + +--- + +## 15. Future Work + +### 15.1 Change Feed + +The change feed is a specialized feed operation with unique characteristics: +- Start-from-beginning, start-from-now, or start-from-timestamp. +- Lease-based partition assignment (for multi-consumer scenarios). +- Scoped to feed ranges (EPK ranges). +- Incremental mode vs. full-fidelity mode. + +The current spec reserves extension points in `OperationPayload`, `OperationTarget`, +`ContinuationOperationKind`, and `PlanStep` for change feed support. + +### 15.2 ORDER BY Merge-Sort + +Cross-partition queries with ORDER BY require a k-way merge of sorted partition streams. +This will be implemented as a `Sort` variant of `PlanStep` that consumes partition `Fetch` +step heads and produces globally ordered pages. + +### 15.3 Aggregation + +Queries with aggregation functions (COUNT, SUM, AVG, etc.) require client-side accumulation +across partitions. This will be implemented as an `Aggregate` variant of `PlanStep`. + +### 15.4 Payload Awareness + +For sort and aggregation, the driver must understand feed response envelopes (the JSON +structure containing the items array, count, etc.). 
This will require a light JSON parsing +layer in the executor, not full item deserialization. + +### 15.5 Hedging for Feed Operations + +The existing hedging mechanism (speculative execution in secondary regions) could be extended +to individual plan steps, allowing fan-out fetches to hedge independently. From 2956b34d808798d852e8cf524221953ba7dbcb32 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Wed, 22 Apr 2026 22:01:06 +0000 Subject: [PATCH 02/29] Revise feed operations spec design Incorporates Java SDK comparison findings: O(1) continuation tokens using nested ResumeState tree, unified execute_operation API, simplified OperationPlan enum (Trivial/MultiStep), restored PartitionKey targeting, ORDER BY filter injection via placeholder, and renamed execute_operation_pipeline to execute_single_operation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../docs/FEED_OPERATIONS_SPEC.md | 845 ++++++++++++------ 1 file changed, 590 insertions(+), 255 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index fe4ed144f19..46a0fe83fb0 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -95,14 +95,17 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i ┌─────────────────────────────────────────────────────────────────────────────────┐ │ CosmosDriver │ │ │ -│ execute_operation(op, opts) → CosmosResponse [point operations] │ -│ execute_feed_operation(op, opts) → FeedPager [feed operations] │ +│ execute_operation(op, opts) → CosmosResponse │ │ │ -│ Both internally: │ +│ A single entry point for ALL operations (point and feed). │ +│ Returns a CosmosResponse which optionally includes a continuation │ +│ token. Point reads never have one; feed operations may. │ +│ The SDK layer decides which operations to expose as pagers. │ +│ │ +│ Internally: │ │ 1. Planner creates an OperationPlan │ -│ 2. PlanExecutor runs the plan │ -│ 3. Point ops: executor drains single page, returns CosmosResponse │ -│ 4. Feed ops: executor is wrapped in FeedPager for caller iteration │ +│ 2. PlanExecutor runs one turn of the plan │ +│ 3. 
Returns CosmosResponse (with optional continuation token) │ │ │ │ ┌──────────────────────────────────────────────────────────────────────────┐ │ │ │ PLANNER │ │ @@ -111,7 +114,7 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i │ │ Output: OperationPlan │ │ │ │ │ │ │ │ Responsibilities: │ │ -│ │ ┌─ Determine targeting (single PK, EPK range, all ranges) │ │ +│ │ ┌─ Determine targeting (point EPK, sub-range, full key space) │ │ │ │ ├─ For ReadMany: group items by PK range, create fan-out steps │ │ │ │ ├─ For cross-partition query: fetch backend query plan, create steps │ │ │ │ ├─ For single-partition ops: create single-step plan │ │ @@ -123,24 +126,23 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i │ │ PLAN EXECUTOR │ │ │ │ │ │ │ │ Input: OperationPlan │ │ -│ │ Output: Stream of FeedResponsePage (or single page for point ops) │ │ +│ │ Output: CosmosResponse (single turn / single page) │ │ │ │ │ │ │ │ Responsibilities: │ │ │ │ ┌─ Execute plan steps with configurable concurrency │ │ -│ │ ├─ Each step calls execute_operation_pipeline() for HTTP │ │ -│ │ ├─ Manage continuation state across turns │ │ +│ │ ├─ Each step calls execute_single_operation() for HTTP │ │ │ │ ├─ Handle partition splits (re-plan affected ranges) │ │ │ │ ├─ Enforce concurrency caps for fan-out │ │ │ │ ├─ Integrate with throughput control │ │ -│ │ ├─ Emit OpenTelemetry spans per turn │ │ -│ │ └─ Produce continuation tokens for serialization │ │ +│ │ ├─ Emit OpenTelemetry spans │ │ +│ │ └─ Produce continuation token in response (if more pages remain) │ │ │ └──────────────────────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ ┌──────────────────────────────────────────────────────────────────────────┐ │ │ │ OPERATION PIPELINE (existing) │ │ │ │ │ │ -│ │ execute_operation_pipeline() — unchanged │ │ +│ │ execute_single_operation() — unchanged │ │ │ │ Handles: region failover, session tokens, transport retry, auth, │ │ │ │ 429 backoff, diagnostics │ │ │ └──────────────────────────────────────────────────────────────────────────┘ │ @@ -149,15 +151,38 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i ### Layer Separation +The existing `execute_operation_pipeline` function is renamed to **`execute_single_operation`** +in this spec. It remains the internal entry point for executing a single Cosmos DB operation +through the operation pipeline (region failover, session tokens, transport retry, auth, 429 +backoff, diagnostics). The feed operations layer calls `execute_single_operation` for each +individual HTTP request within a plan. + | Concern | Component | Location | |---------|-----------|----------| | Operation intent & payload | `CosmosOperation` | `models/cosmos_operation.rs` | | Plan creation | `Planner` | `driver/feed/planner.rs` (new) | | Plan model | `OperationPlan`, `PlanStep` | `driver/feed/plan.rs` (new) | | Plan execution | `PlanExecutor` | `driver/feed/executor.rs` (new) | -| Public pager | `FeedPager` | `driver/feed/pager.rs` (new) | | Continuation state | `ContinuationToken` | `models/continuation_token.rs` (new) | -| Per-step HTTP execution | `execute_operation_pipeline` | `driver/pipeline/` (existing) | +| Per-step HTTP execution | `execute_single_operation` | `driver/pipeline/` (existing) | + +### Open Issue: Re-Planning on Every Page + +Because `execute_operation` is stateless, the driver must re-plan the operation on every +call — including subsequent pages of a paginated feed. 
The Planner uses the continuation
+token to reconstruct the plan state, but still performs the full planning step (PK range
+resolution, and for cross-partition queries, potentially a backend query plan fetch) on each
+page.
+
+For in-process callers (the common case), this is wasteful: the SDK crate calls
+`execute_operation` in a loop, and the plan doesn't change between pages (barring partition
+splits). A future optimization could allow `CosmosResponse` and/or `CosmosOperation` to
+carry a **cached `OperationPlan`** so that subsequent requests skip re-planning when the
+plan is still valid. The cached plan would be invalidated on partition splits (410/1002) or
+account metadata changes, falling back to a full re-plan.
+
+This optimization is not required for correctness — the stateless model works correctly
+today — but should be considered for performance-sensitive workloads with many small pages.

---

@@ -235,41 +260,55 @@

### 3.2 OperationTarget

-Partition targeting is currently a single `Option<PartitionKey>` field. Feed operations introduce
-additional targeting modes. These are mutually exclusive, so they become an enum:
+Partition targeting is currently a single `Option<PartitionKey>` field. Feed operations require
+richer targeting. The targeting enum has three modes: no partition scope, a specific logical
+partition key (needed for point reads where the raw partition key value must be sent to the
+backend), or an EPK range for feed operations spanning one or more partitions.

```rust
/// How the operation is targeted to partitions.
-///
-/// Determines which partition key range(s) the operation executes against.
-/// Only one targeting mode is active per operation.
#[derive(Clone, Debug)]
pub enum OperationTarget {
-    /// No partition targeting (account-level or database-level operations).
+    /// No partition targeting (account-level or database-level operations,
+    /// such as CreateDatabase or ReadContainer).
    None,

    /// Target a specific logical partition key.
-    /// Used for: single-partition reads, writes, queries.
+    ///
+    /// Used for point operations (read, create, delete, upsert, replace)
+    /// and single-partition feed operations where the raw partition key
+    /// value must be included in the request headers.
    PartitionKey(PartitionKey),

-    /// Target a specific effective partition key range.
-    /// Used for: scoped feed operations on a sub-range.
-    EpkRange {
-        min_inclusive: EffectivePartitionKey,
-        max_exclusive: EffectivePartitionKey,
-    },
-
-    /// Target a specific partition key range by its server-assigned ID.
-    /// Used for: resuming from a continuation that recorded the range ID.
-    PkRangeId(String),
+    /// Target an effective partition key range.
+    ///
+    /// Used for feed operations that span one or more partitions.
+    /// Uses the existing `EpkRange` type from
+    /// `models::range`.
+    ///
+    /// The pipeline resolves the EPK range to the owning PK range ID(s) via
+    /// the `PartitionKeyRangeCache` at execution time.
+    EpkRange(EpkRange),
+}

-    /// Target all partition key ranges.
-    /// Semantically equivalent to `EpkRange { min: "00", max: "FF" }`.
-    /// Used for: cross-partition queries, read-all-items, read-many.
-    AllRanges,
+impl OperationTarget {
+    /// The full key space: targets all partition key ranges. 
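+    /// Spans the entire EPK space `["", "FF")`; the Planner resolves it to
+    /// the concrete partition key ranges via the `PartitionKeyRangeCache`.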
+ pub fn all_ranges() -> Self { + Self::EpkRange(EpkRange::new( + EffectivePartitionKey::MIN, + EffectivePartitionKey::MAX, + true, + false, + )) + } } ``` +**Future optimization:** `EpkRange` could gain an optional PK range ID hint to skip the +cache lookup when the mapping is already known (e.g., from a previous routing decision or a +cached plan). The hint would be advisory — the pipeline would fall back to EPK-based +resolution if the hint is stale after a partition split. + ### 3.3 Factory Method Updates Existing factory methods are updated to use `OperationPayload` and `OperationTarget`: @@ -281,11 +320,13 @@ impl CosmosOperation { let partition_key = item.partition_key().clone(); Self::new(OperationType::Read, item) .with_target(OperationTarget::PartitionKey(partition_key)) - // No payload needed — item ID is in the resource reference. } /// Creates an item. Use `with_body()` to provide the document JSON. - pub fn create_item(container: ContainerReference, partition_key: PartitionKey) -> Self { + pub fn create_item( + container: ContainerReference, + partition_key: PartitionKey, + ) -> Self { let resource_ref = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); @@ -320,7 +361,7 @@ impl CosmosOperation { .with_resource_type(ResourceType::Document) .into_feed_reference(); Self::new(OperationType::Query, resource_ref) - .with_target(OperationTarget::AllRanges) + .with_target(OperationTarget::all_ranges()) .with_payload(OperationPayload::Query { query: query.into(), parameters: None, @@ -336,7 +377,7 @@ impl CosmosOperation { .with_resource_type(ResourceType::Document) .into_feed_reference(); Self::new(OperationType::Query, resource_ref) - .with_target(OperationTarget::AllRanges) + .with_target(OperationTarget::all_ranges()) .with_payload(OperationPayload::ReadMany { items }) } } @@ -360,9 +401,9 @@ it reaches the transport pipeline, so the transport never sees a `ReadMany` payl ### 4.1 Plan Model -An `OperationPlan` is a directed acyclic graph (DAG) of `PlanStep` nodes. Each step represents -a unit of work that produces a partial result. Steps may depend on other steps (for merge/sort), -and may produce continuations for subsequent turns. +An `OperationPlan` describes the steps needed to execute an operation. It is an enum with +two variants: `Trivial` for single-step plans (stack-allocated, no heap overhead) and +`MultiStep` for fan-out plans. ```rust /// A plan for executing an operation. @@ -370,37 +411,42 @@ and may produce continuations for subsequent turns. /// Plans range from trivial (single step for a point read) to complex /// (fan-out across partition key ranges with merge). The plan is created /// by the Planner and executed by the PlanExecutor. -pub(crate) struct OperationPlan { - /// The steps in this plan, indexed by StepId. - steps: Vec, - - /// Which step produces the final output. - /// For single-step plans, this is step 0. - /// For fan-out plans, this is typically a merge step. - output_step: StepId, - - /// Whether this plan supports pagination (multiple turns). - /// ReadMany plans do not paginate; query plans do. - paginates: bool, +pub(crate) enum OperationPlan { + /// A single-step plan. Stack-allocated, no heap overhead. + /// Used for point operations and single-partition feed operations. + Trivial(PlanStep), + + /// A multi-step plan. The last step in the Vec is the output step. + /// Used for fan-out operations (ReadMany, cross-partition queries). 
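+    /// The `Vec` of steps is the only heap allocation the plan model adds.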
+    MultiStep {
+        steps: Vec<PlanStep>,
+    },
}

-/// A unique identifier for a step within an OperationPlan.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub(crate) struct StepId(usize);
-
/// A single step in an operation plan.
pub(crate) enum PlanStep {
    /// Execute a single HTTP request via the operation pipeline.
    ///
-    /// This is the leaf step that actually talks to the Cosmos DB service.
-    /// It carries a `CosmosOperation` configured for a specific PK range.
+    /// The `operation` carries the **unrewritten** query from the backend
+    /// query plan, which may contain the `{documentdb-formattableorderbyquery-filter}`
+    /// placeholder token. At execution time, the executor replaces this
+    /// token with the `resume_filter` (if present) via simple string
+    /// substitution before sending the request.
    Fetch {
        /// The operation to execute. Targeted to a specific PK range.
+        /// For ORDER BY queries, the query text contains the
+        /// `{documentdb-formattableorderbyquery-filter}` placeholder.
        operation: CosmosOperation,
        /// Options for this fetch.
        options: OperationOptions,
        /// Server-provided continuation token for this range, if resuming.
        continuation: Option<String>,
+        /// A filter expression to inject into the query by replacing
+        /// `{documentdb-formattableorderbyquery-filter}` in the query text.
+        /// Set by the Planner when resuming from an `OrderBy` continuation.
+        /// For example: `"c.name > 'Baker'"` or `"c.name >= 'Baker'"`.
+        /// `None` for first-page execution or non-ORDER BY queries.
+        resume_filter: Option<String>,
    },

    /// Merge results from multiple upstream steps with no ordering guarantee.
@@ -415,13 +461,13 @@ pub(crate) enum PlanStep {
    /// sort across partitions. For globally-ordered results, use `OrderedMerge`
    /// (future).
    UnorderedMerge {
-        /// The steps whose results are merged.
-        inputs: Vec<StepId>,
+        /// Indices of the steps whose results are merged.
+        inputs: Vec<usize>,
    },

    // Future variants:
-    // OrderedMerge { inputs: Vec<StepId>, order_by: ... },
-    // Aggregate { inputs: Vec<StepId>, aggregation: ... },
+    // OrderedMerge { inputs: Vec<usize>, order_by: ... },
+    // Aggregate { inputs: Vec<usize>, aggregation: ... },
}
```

@@ -430,10 +476,11 @@ pub(crate) enum PlanStep {

#### Point Operation (ReadItem)

```text
-Step 0: Fetch(read_item operation) → output
+Trivial(Fetch(read_item operation)) → output
```

-A trivial plan with one step. The executor runs it, gets a `CosmosResponse`, done.
+A `Trivial` plan with one `Fetch` step. The executor runs it directly, gets a
+`CosmosResponse`, done. No heap allocation.

#### ReadMany

@@ -448,7 +495,7 @@ The executor runs steps 0–2 concurrently (subject to concurrency cap), each dr
server-side pages to completion. Step 3 merges the fully-buffered results.

**Optimization:** When a PK range contains only a single item, the Planner MAY optimize
-the `Fetch` step to a point read (`OperationType::Read` with `OperationTarget::PartitionKey`)
+the `Fetch` step to a point read (`OperationType::Read` with a point EPK range target)
instead of a query, avoiding the overhead of query parsing on the backend.

#### Single-Partition Query

@@ -467,7 +514,7 @@
Step 0: Fetch(query to PK range "0")
Step 1: Fetch(query to PK range "1")
Step 2: Fetch(query to PK range "2")
Step 3: UnorderedMerge(inputs: [0, 1, 2]) → output
-        (or OrderedMerge for explicit ORDER BY)
+        (or Step 3: OrderedMerge for explicit ORDER BY)
```

Each turn, the executor advances whichever PK range steps have results available.

@@ -498,12 +545,11 @@ steps to complete before emitting a page. 
Instead: ### 4.4 Trivial Plan Optimization For point operations, the plan model MUST be zero or near-zero overhead compared to the current -direct `execute_operation_pipeline` call. Implementation strategies: +direct `execute_single_operation` call. The `OperationPlan::Trivial` variant ensures this: -- **Inline the trivial case**: When the plan is a single `Fetch` step with no dependencies, - the executor can skip graph traversal and directly call `execute_operation_pipeline`. -- **Stack allocation**: Trivial plans can use a fixed-size array or inline representation - rather than heap-allocated `Vec`. +- **No heap allocation**: The single `PlanStep` is stored inline in the enum, not in a `Vec`. +- **No graph traversal**: The executor matches on `Trivial` and directly calls + `execute_single_operation`. --- @@ -550,16 +596,16 @@ impl<'a> Planner<'a> { | CreateDatabase, ReadContainer, etc. | `None` | Single `Fetch` step. Trivial. | | QueryItems (single partition) | `PartitionKey` | Single `Fetch` step. Paginated. | | ReadAllItems (single partition) | `PartitionKey` | Single `Fetch` step. Paginated. | -| QueryItems (cross-partition) | `AllRanges` | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. May fetch query plan. | -| ReadMany | `AllRanges` | Group items by PK range → N `Fetch` steps + `UnorderedMerge`. No pagination. | -| ReadAllItems (cross-partition) | `AllRanges` | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. Paginated. | +| QueryItems (cross-partition) | `EpkRange` (`all_ranges()`) | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. May fetch query plan. | +| ReadMany | `EpkRange` (`all_ranges()`) | Group items by PK range → N `Fetch` steps + `UnorderedMerge`. No pagination. | +| ReadAllItems (cross-partition) | `EpkRange` (`all_ranges()`) | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. Paginated. | | ChangeFeed (future) | varies | TBD | ### 5.3 Operation Decomposition: From One `CosmosOperation` to Many A key responsibility of the Planner is decomposing a single caller-provided `CosmosOperation` into multiple targeted `CosmosOperation` instances — one per partition key range — that each -flow through `execute_operation_pipeline` independently. This section illustrates the full +flow through `execute_single_operation` independently. This section illustrates the full decomposition for two representative operations. #### Example: Cross-Partition Query @@ -571,31 +617,36 @@ let op = CosmosOperation::query_items_cross_partition( container.clone(), "SELECT * FROM c WHERE c.status = 'active'", ); -// op.target == OperationTarget::AllRanges +// op.target == OperationTarget::all_ranges() (full EPK range ["", "FF")) // op.payload == OperationPayload::Query { query: "SELECT ...", parameters: None } ``` -The Planner resolves the container's partition key ranges (say, ranges "0", "1", "2") and -produces a plan with **three separate `CosmosOperation`** instances: +The Planner first fetches a **backend query plan** from the service (see [§5.4](#54-query-plan-fetching)) +to determine how the query should be distributed across partitions — including whether +client-side sort or aggregation is required. 
It then resolves the container's partition key +ranges (say, ranges "0", "1", "2") and uses the backend query plan to assemble an +`OperationPlan` with **three separate `CosmosOperation`** instances: ```text Caller's CosmosOperation - target: AllRanges + target: EpkRange ["", "FF") (all_ranges()) payload: Query { "SELECT * FROM c WHERE c.status = 'active'" } │ ▼ - ┌─── Planner ───┐ - │ Resolve PK │ - │ ranges: 0,1,2 │ - └───────┬────────┘ + ┌─── Planner ──────────────────────────────────┐ + │ 1. Fetch backend query plan (via §5.4) │ + │ 2. Resolve PK ranges: 0, 1, 2 │ + │ 3. Assemble plan from query plan + PK ranges │ + └───────┬──────────────────────────────────────┘ ┌───────────┼───────────┐ ▼ ▼ ▼ CosmosOperation CosmosOperation CosmosOperation type: Query type: Query type: Query target: target: target: - PkRangeId("0") PkRangeId("1") PkRangeId("2") + EpkRange EpkRange EpkRange + ["","55") ["55","AA") ["AA","FF") payload: payload: payload: - Query{same SQL} Query{same SQL} Query{same SQL} + Query{rewritten} Query{rewritten} Query{rewritten} │ │ │ ▼ ▼ ▼ execute_operation execute_operation execute_operation @@ -609,12 +660,15 @@ Caller's CosmosOperation UnorderedMerge │ ▼ - FeedResponsePage + CosmosResponse ``` -Each decomposed `CosmosOperation` carries the same query payload but is **retargeted** to a -specific PK range ID. The operation pipeline handles region failover, retry, and auth for each -independently. +Each decomposed `CosmosOperation` is **retargeted** to a specific EPK range. Note that the +query payload may differ from the caller's original SQL: the backend query plan may +**rewrite the query** (e.g., to push down aggregations, add internal projections, or +restructure filters for per-partition execution), and the Planner uses the rewritten query +text in the decomposed operations. The operation pipeline handles region failover, retry, +and auth for each independently. #### Example: ReadMany @@ -634,7 +688,7 @@ The Planner computes EPKs for each partition key, groups by PK range, and produc ```text Caller's CosmosOperation - target: AllRanges + target: EpkRange ["", "FF") (all_ranges()) payload: ReadMany { items: [(id_a,pk_a), (id_b,pk_b), (id_c,pk_c), (id_d,pk_d), (id_e,pk_e)] } │ ▼ @@ -648,7 +702,9 @@ Caller's CosmosOperation CosmosOperation CosmosOperation CosmosOperation type: Query type: Read type: Query target: target: target: - PkRangeId("0") PK(pk_c) PkRangeId("2") + EpkRange EpkRange EpkRange + ["","55") [EPK(pk_c), ["AA","FF") + EPK(pk_c)) payload: payload: payload: Body{query on None (point Body{query on (pk_a,id_a), read of id_c) (pk_d,id_d), @@ -659,9 +715,9 @@ Note two things: 1. The ReadMany query for each PK range filters on **both partition key and ID**, because ID alone is not unique — only (PartitionKey, ID) is unique within a container. 2. PK range "1" contains only a single item, so the Planner **optimizes it to a point read** - (`OperationType::Read` with `OperationTarget::PartitionKey`), avoiding query overhead. + (`OperationType::Read` with a point EPK range), avoiding query overhead. -Each decomposed operation then flows through `execute_operation_pipeline` independently. +Each decomposed operation then flows through `execute_single_operation` independently. ### 5.4 Query Plan Fetching @@ -671,12 +727,12 @@ For cross-partition queries, the Planner may need a backend query plan to determ - Optimized partition routing The Planner uses a **callback** to fetch the query plan, keeping it transport-decoupled. 
The
-callback internally calls `execute_operation_pipeline` (not `execute_operation`), avoiding
+callback internally calls `execute_single_operation` (not `execute_operation`), avoiding
re-entry into the Planner. The `OperationType::QueryPlan` variant already exists for this.

```rust
// The Planner calls this callback, which the driver wires to
-// execute_operation_pipeline directly (bypassing the Planner).
+// execute_single_operation directly (bypassing the Planner).
async fn fetch_query_plan(
    operation: &CosmosOperation,
    options: &OperationOptions,
) -> azure_core::Result<BackendQueryPlan> {
    let query_plan_op = CosmosOperation::query_plan(
        operation.container().unwrap().clone(),
        /* query text from operation payload */
    );
-    let response = execute_operation_pipeline(query_plan_op, options, ...).await?;
+    let response = execute_single_operation(query_plan_op, options, ...).await?;
    BackendQueryPlan::from_response(response)
}
```

-This avoids the recursion concern: `fetch_query_plan` calls `execute_operation_pipeline`
+This avoids the recursion concern: `fetch_query_plan` calls `execute_single_operation`
directly, which is the internal pipeline function, not the public `execute_operation` that
goes through the Planner.

@@ -698,11 +754,20 @@ goes through the Planner.

### 5.5 Resume Planning

When a `ContinuationToken` is provided, the Planner uses it to reconstruct the plan state:

-1. Validate the token version and operation compatibility.
-2. Restore per-range continuation state.
-3. Skip ranges that are already completed.
-4. If a PK range ID in the token no longer exists (partition split), re-resolve using the
-   EPK range bounds stored in the token and map to the new PK range(s).
+1. Validate the token version, container RID, and operation kind compatibility.
+2. Resolve the current partition key ranges for the container.
+3. Use `PartitionMapper` to classify each range relative to the token's target range:
+   - **Left of target** — ranges whose EPK max ≤ target's EPK min. These are fully drained
+     and receive no `Fetch` step (for unordered) or receive a filter-only `Fetch` step
+     (for ORDER BY, filtering past the last returned ORDER BY values).
+   - **Target range** — the range overlapping the token's EPK bounds. Resumes using the
+     stored `server_continuation`. If the range has split, the Planner maps EPK bounds to
+     the child range(s) and assigns the server continuation appropriately.
+   - **Right of target** — ranges whose EPK min ≥ target's EPK max. Start fresh with no
+     continuation (for unordered) or with a filter from the ORDER BY resume state.
+4. For `OrderedQuery` tokens, extract the `OrderByResumeState` and generate per-partition
+   query filters based on the last returned ORDER BY values. Attach duplicate-elimination
+   state (last `_rid`) for the target range.

---

@@ -724,24 +789,15 @@ pub(crate) struct PlanExecutor {
}

impl PlanExecutor {
-    /// Executes the next turn, producing one page of results.
-    ///
-    /// Returns `None` when the plan is complete (no more pages).
+    /// Executes one turn of the plan, producing a `CosmosResponse`.
    ///
-    /// For non-paginating plans (ReadMany), the first call drives all
-    /// steps to completion and returns the merged result. Subsequent
-    /// calls return `None`.
+    /// The response includes a continuation token if more pages are available.
+    /// For non-paginating plans (ReadMany), this drives all steps to completion
+    /// and returns the merged result with no continuation token.
-    pub async fn next_turn(
+    pub async fn execute(
        &mut self,
        driver_context: &DriverContext,
-    ) -> azure_core::Result<Option<FeedResponsePage>> {
-        // ... 
-    }
-
-    /// Serializes the current execution state into a continuation token.
-    ///
-    /// Returns `None` if the plan is complete or does not support pagination.
-    pub fn continuation_token(&self) -> Option<ContinuationToken> {
+    ) -> azure_core::Result<CosmosResponse> {
        // ...
    }
}
```

### 6.2 Turn Execution

-Each call to `next_turn`:
+Each call to `execute`:

1. **Emit OpenTelemetry span** for this turn (child of the feed operation span, linked to root).
2. **Identify runnable steps** — steps whose dependencies are satisfied.
3. **Execute runnable steps concurrently** (up to concurrency cap), each via
-   `execute_operation_pipeline`.
+   `execute_single_operation`.
4. **Collect results** from completed steps.
5. **Advance continuation state** for steps that returned server continuations.
6. **Execute dependent steps** (e.g., `UnorderedMerge`) when their inputs are ready.

@@ -769,12 +825,12 @@

### 6.3 Fan-Out Concurrency

Fan-out steps are executed with a configurable concurrency cap:

```rust
/// Maximum number of concurrent partition key range fetches.
///
/// Defaults to `min(num_pk_ranges, 10)`. Configurable via
-/// `FeedOperationOptions::max_concurrency`.
+/// `OperationOptions::max_concurrency`.
concurrency_limit: usize,
```

The executor uses a semaphore or similar mechanism to limit concurrent
-`execute_operation_pipeline` calls. Each concurrent call independently goes through the
+`execute_single_operation` calls. Each concurrent call independently goes through the
full operation pipeline (region failover, retry, etc.).

### 6.4 ReadMany Execution Details

@@ -789,8 +845,8 @@ ReadMany is the initial target. Its execution:
   (PartitionKey, ID) pair is unique within a container. If the response includes a server
   continuation, the executor continues fetching that range until all items are retrieved.
5. **UnorderedMerge** step concatenates results from all ranges.
-6. Returns a single `FeedResponsePage` containing all items.
-7. Subsequent `next_turn` calls return `None` (ReadMany does not paginate).
+6. Returns a single `CosmosResponse` containing all items (with no continuation token).
+7. Subsequent calls with the same operation (no continuation) would re-execute from scratch.

**Optimization:** When a PK range contains only a single item, the Planner optimizes
the `Fetch` step to a point read instead of a query (see §4.2).

@@ -805,7 +861,7 @@ ReadMany is the initial target. Its execution:

### 6.5 Backpressure & Cancellation

-- **Caller drops the `FeedPager`**: In-flight `execute_operation_pipeline` futures are
+- **Caller drops the future**: In-flight `execute_single_operation` futures are
  cancelled via standard Rust drop semantics. The executor does not buffer results beyond
  what is needed for the current turn.
- **Memory bounds**: The executor does not buffer more than `concurrency_limit` concurrent

@@ -819,7 +875,21 @@

## 7. Continuation Tokens

-### 7.1 Token Structure
+### 7.1 Design Principle: O(1) Token Size
+
+A container may have many physical partitions. Storing per-range continuation state
+for every partition would make the token size linear in partition count — unacceptable for
+tokens that must cross HTTP request boundaries (e.g., sent to a browser in a URL or header).
+
+Instead, the continuation token stores the state of **exactly one partition key range** — the
+range where execution last yielded results. On resume, the Planner reconstructs the positions
+of all other partitions using **query filter rewriting** rather than stored server tokens. 
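+
+For intuition, the resume-time work reduces to a comparison on EPK bounds. A minimal sketch
+(hypothetical `min`/`max` accessors on `EpkRange`; the real logic lives in the Planner and
+`PartitionMapper`, see §5.5):
+
+```rust
+/// What the Planner does with one current PK range on resume (unordered case).
+enum ResumeAction {
+    Skip,                       // fully drained before the stored cursor
+    ResumeWith(Option<String>), // resume from the stored server continuation
+    StartFresh,                 // not yet visited
+}
+
+fn classify(range: &EpkRange, stored: &EpkRange, token: &Option<String>) -> ResumeAction {
+    if range.max() <= stored.min() {
+        ResumeAction::Skip
+    } else if range.min() >= stored.max() {
+        ResumeAction::StartFresh
+    } else {
+        ResumeAction::ResumeWith(token.clone())
+    }
+}
+```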
+
+This follows the same pattern as the Java Cosmos SDK, which exploits the fact that Cosmos DB
+data has a composite sort order `(query_sort_order, partition_key_range_id)` to generate
+efficient range filters for partitions that don't have stored continuation tokens.
+
+### 7.2 Token Structure

```rust
/// A typed continuation token for resuming a feed operation.
@@ -827,6 +897,16 @@
/// Opaque to callers. Serializes to a string via `Display` and
/// deserializes via `FromStr`. The internal representation is
/// versioned and validated on deserialization.
+///
+/// The token mirrors the plan's step graph as a **nested** structure:
+/// each pipeline stage wraps the continuation state of its children.
+/// This means each layer can interpret its children's state in context
+/// — for example, an `OrderBy` node knows how to generate filters for
+/// the `Fetch` nodes it wraps, without the Fetch nodes needing to be
+/// aware of ORDER BY semantics.
+///
+/// On resume, the Planner walks the nested token top-down, matching
+/// each layer to the corresponding step in the re-created plan.
#[derive(Clone, Debug)]
pub struct ContinuationToken {
    inner: ContinuationTokenInner,
@@ -839,42 +919,189 @@ struct ContinuationTokenInner {
    version: u32,

    /// Container identity (RID, not name) to detect container recreation.
+    #[serde(rename = "containerRid")]
    container_rid: String,

-    /// The operation kind this token is valid for.
-    operation_kind: ContinuationOperationKind,
-
-    /// Per-partition-range state.
-    ranges: Vec<RangeContinuation>,
+    /// The nested resume state, rooted at the plan's output step.
+    /// Each layer wraps the state of its child steps.
+    resume: ResumeState,
}

+/// Nested resume state for a plan step.
+///
+/// Each variant captures the state for one pipeline stage and embeds
+/// its children's state. This forms a tree that mirrors the plan DAG.
+/// New variants can be added as new pipeline stages are introduced.
#[derive(Clone, Debug, Serialize, Deserialize)]
-struct RangeContinuation {
-    /// The EPK bounds of this range (stable across splits).
-    min_inclusive_epk: String,
-    max_exclusive_epk: String,
-
-    /// The PK range ID at the time the continuation was created.
-    /// Used as a hint for fast resolution; falls back to EPK bounds
-    /// if the range has split.
-    pk_range_id: String,
-
-    /// Server-provided continuation token for this range.
-    /// `None` means this range is completed.
-    server_continuation: Option<String>,
+#[serde(tag = "type")]
+enum ResumeState {
+    /// A single partition fetch, mid-stream or just completed.
+    /// This is a leaf node — it has no children.
+    #[serde(rename = "fetch")]
+    Fetch {
+        /// EPK min inclusive of the target range.
+        min: String,
+
+        /// EPK max exclusive of the target range.
+        max: String,
+
+        /// Server-provided continuation token for this range.
+        /// Absent when this range was just completed and the cursor
+        /// is at the boundary to the next range.
+        #[serde(rename = "serverToken", skip_serializing_if = "Option::is_none")]
+        server_continuation: Option<String>,
+    },
+
+    /// An unordered (sequential-drain) merge over partitions.
+    /// Wraps the child `Fetch` that was active when the token was created.
+    /// On resume, partitions left of the child are skipped, the child
+    /// resumes from its state, and partitions to the right start fresh.
+    #[serde(rename = "drain")]
+    Drain {
+        /// The resume state of the active child Fetch step.
+        inner: Box<ResumeState>,
+    },
+
+    /// An ordered (k-way merge) over partitions. 
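+    /// Produced by cross-partition ORDER BY queries.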
+    /// Wraps the child `Fetch` that last produced results, plus the
+    /// ORDER BY values needed to generate filters for all other partitions.
+    #[serde(rename = "orderBy")]
+    OrderBy {
+        /// The ORDER BY values of the last document returned.
+        /// Used to generate range filters for non-target partitions.
+        #[serde(rename = "lastValues")]
+        last_order_by_values: Vec<serde_json::Value>,
+
+        /// The `_rid` of the last document returned.
+        /// Used for duplicate elimination on the target partition.
+        #[serde(rename = "lastRid")]
+        last_rid: String,
+
+        /// Whether to include documents matching the last ORDER BY values.
+        inclusive: bool,
+
+        /// The resume state of the target child Fetch step.
+        inner: Box<ResumeState>,
+    },
+
+    // Future variants:
+    //
+    // /// An offset/limit stage wrapping an inner pipeline.
+    // #[serde(rename = "offsetLimit")]
+    // OffsetLimit {
+    //     skipped: u64,
+    //     returned: u64,
+    //     inner: Box<ResumeState>,
+    // },
}
```

-#[derive(Clone, Debug, Serialize, Deserialize)]
-enum ContinuationOperationKind {
-    Query,
-    ReadFeed,
-    // Future: ChangeFeed, etc.
-}

+The nesting means each layer owns the interpretation of its children. An `OrderBy` node
+knows the `Fetch` inside it is the target partition, and uses `lastValues`/`lastRid` to
+generate filters for the other partitions. A `Drain` node knows the `Fetch` inside it is
+the cursor position, and partitions left/right of it are skipped/fresh. Neither the `Fetch`
+node nor the Planner need to cross-reference sibling state.
+
+#### Wire-format field reference
+
+| Rust type | Field | Wire key | Content |
+|-----------|-------|----------|---------|
+| `ContinuationTokenInner` | `version` | `version` | Format version (integer) |
+| | `container_rid` | `containerRid` | Container RID (string) |
+| | `resume` | `resume` | Nested `ResumeState` (root of tree) |
+| `ResumeState::Fetch` | *(tag)* | `type` | `"fetch"` |
+| | `min` | `min` | EPK min inclusive (hex string) |
+| | `max` | `max` | EPK max exclusive (hex string) |
+| | `server_continuation` | `serverToken` | Server continuation (omitted if null) |
+| `ResumeState::Drain` | *(tag)* | `type` | `"drain"` |
+| | `inner` | `inner` | Child `ResumeState` |
+| `ResumeState::OrderBy` | *(tag)* | `type` | `"orderBy"` |
+| | `last_order_by_values` | `lastValues` | Last ORDER BY values (array) |
+| | `last_rid` | `lastRid` | Last document `_rid` (string) |
+| | `inclusive` | `inclusive` | Include matching values (bool) |
+| | `inner` | `inner` | Child `ResumeState` |
+
+### 7.3 Resume Strategy
+
+On resume, the Planner walks the nested `ResumeState` tree top-down, matching each layer to
+the corresponding step in the re-created plan. Each layer interprets its own state and its
+child's state in context:
+
+#### `Drain` (unordered cross-partition)
+
+The `Drain` node wraps a `Fetch` child representing the cursor position. On resume:
+
+| Partition position | Action |
+|--------------------|--------|
+| **Left of child** (EPK max ≤ child's min) | Skip — already drained. |
+| **Child range** (matches child's EPK bounds) | Resume using child's `serverToken`. |
+| **Right of child** (EPK min ≥ child's max) | Start fresh (not yet visited). |
+
+If the child's range has split, `PartitionMapper` uses the EPK bounds to assign the server
+continuation to the appropriate child range(s).
+
+#### `OrderBy` (ordered cross-partition)
+
+The `OrderBy` node wraps a `Fetch` child (the target partition) and carries `lastValues` /
+`lastRid` for filter generation. 
On resume: + +| Partition position | Generated filter | Rationale | +|--------------------|-----------------|-----------| +| **Left of child** | ORDER BY values **strictly past** `lastValues` | May have remaining items, but only those after the resume point. | +| **Child range** | Server continuation + ORDER BY values **at or past** `lastValues` | Resume exactly where we stopped. | +| **Right of child** | ORDER BY values **at or past** `lastValues` | Haven't fully explored these yet. | + +Duplicate elimination: on the child partition, documents with the same ORDER BY values as +`lastValues` but `_rid ≤ lastRid` have already been returned and are filtered out. + +#### `Fetch` (leaf — single partition) + +A bare `Fetch` at the root (no wrapping `Drain` or `OrderBy`) represents a single-partition +operation. Resume uses `serverToken` directly. + +#### Nesting composes naturally + +Future pipeline stages wrap their children the same way: + +```text +OffsetLimit { skipped: 50, returned: 20, + inner: OrderBy { lastValues: ["Baker"], lastRid: "abc", inclusive: true, + inner: Fetch { min: "55", max: "AA", serverToken: "..." } + } } ``` -### 7.2 Serialization +Each layer reads only its own fields plus `inner`. No layer needs to inspect sibling or +grandchild state. + +#### Mapping `ResumeState` back to `PlanStep` + +The `ResumeState` tree does not map 1:1 to `PlanStep` variants — it maps to the **Planner's +reconstruction logic**: + +| `ResumeState` | Effect on plan | +|---------------|----------------| +| `Fetch` | Sets `PlanStep::Fetch.continuation` to the stored `serverToken`. The EPK bounds identify which `Fetch` step in the plan to target. | +| `Drain` | The Planner uses the child `Fetch`'s EPK bounds to determine which partition was active, skips partitions left of it, and starts right partitions fresh. The `UnorderedMerge` step itself is stateless. | +| `OrderBy` | The Planner generates a `resume_filter` string from `lastValues` and sets it on each `Fetch` step. The child `Fetch`'s `continuation` is also restored. Duplicate elimination state (`lastRid`, `inclusive`) is applied at the executor level. | + +**Filter injection for ORDER BY queries:** The backend query plan provides a rewritten query +containing the `{documentdb-formattableorderbyquery-filter}` placeholder token. The `Fetch` +step's `operation` holds this **unrewritten** query text. At execution time, the executor +replaces the placeholder with the `resume_filter` via simple string substitution. This means: + +- On **first page** (no continuation): the placeholder is replaced with `"true"` (no filter). +- On **resume**: the Planner computes the filter expression from the `OrderBy` resume state + (e.g., `"c.name > 'Baker'"` for left-of-target partitions, `"c.name >= 'Baker'"` for the + target and right-of-target) and sets it as `resume_filter` on each `Fetch` step. + +This approach keeps the `Fetch` step generic — it doesn't need to understand ORDER BY +semantics, just string substitution on a known placeholder. + +### 7.4 Serialization -`ContinuationToken` implements `Display` and `FromStr`. The wire format is base64-encoded JSON: +`ContinuationToken` implements `Display` and `FromStr`. 
The wire format is base64url-encoded +JSON (using the URL-safe alphabet with no padding): ```rust impl Display for ContinuationToken { @@ -906,15 +1133,131 @@ impl FromStr for ContinuationToken { } ``` -### 7.3 Compatibility Contract +#### Sample Tokens + +**Unordered cross-partition query, mid-stream on partition ["55","AA")** + +A `Drain` wraps the active `Fetch`: + +JSON (before base64 encoding): +```json +{ + "version": 2, + "containerRid": "dbs/abc/colls/def", + "resume": { + "type": "drain", + "inner": { + "type": "fetch", + "min": "55", + "max": "AA", + "serverToken": "+RID:~abc123#RT:1#TRC:10#ISV:2#IEO:65551" + } + } +} +``` + +On resume, the `Drain` sees its child targets `["55","AA")`. Partitions left of `"55"` are +skipped, the target resumes from `serverToken`, and partitions right of `"AA"` start fresh. + +**Unordered query, target partition just completed (cursor at boundary)** + +```json +{ + "version": 2, + "containerRid": "dbs/abc/colls/def", + "resume": { + "type": "drain", + "inner": { + "type": "fetch", + "min": "55", + "max": "AA" + } + } +} +``` + +`serverToken` is absent, meaning partition `["55","AA")` is fully drained. The Planner +skips everything up to and including this range, and starts the next partition fresh. + +**Single-partition query, mid-stream** + +A bare `Fetch` at the root (no wrapping layer): + +```json +{ + "version": 2, + "containerRid": "dbs/abc/colls/def", + "resume": { + "type": "fetch", + "min": "55", + "max": "AA", + "serverToken": "-RID:QmFzZTY0#RT:3#TRC:50" + } +} +``` + +**ORDER BY cross-partition query, `ORDER BY c.name ASC`** + +An `OrderBy` wraps the target `Fetch`, carrying the last returned document's sort values: + +```json +{ + "version": 2, + "containerRid": "dbs/abc/colls/def", + "resume": { + "type": "orderBy", + "lastValues": ["Baker"], + "lastRid": "R3JlYXQ", + "inclusive": true, + "inner": { + "type": "fetch", + "min": "55", + "max": "AA", + "serverToken": "+RID:~abc456#RT:2#TRC:5#ISV:2#IEO:65551" + } + } +} +``` + +On resume, the `OrderBy` layer generates partition filters from `lastValues`: +- Partitions left of `"55"`: filter `c.name > 'Baker'` (strictly past). +- Target `["55","AA")`: resume from `serverToken`, filter `c.name >= 'Baker'`, + deduplicate items with `_rid ≤ "R3JlYXQ"`. +- Partitions right of `"AA"`: filter `c.name >= 'Baker'`. + +**Compound ORDER BY, `ORDER BY c.name ASC, c.age DESC`** + +```json +{ + "version": 2, + "containerRid": "dbs/abc/colls/def", + "resume": { + "type": "orderBy", + "lastValues": ["Baker", 42], + "lastRid": "UmVzdW1l", + "inclusive": true, + "inner": { + "type": "fetch", + "min": "AA", + "max": "FF", + "serverToken": "+RID:~abc789#RT:1#TRC:3#ISV:2" + } + } +} +``` + +The `lastValues` array contains one entry per ORDER BY column, in declaration order. + +### 7.5 Compatibility Contract A continuation token is **invalidated** by: -1. **Container recreation** — The token's `container_rid` won't match the new container's RID. +1. **Container recreation** — The token's `containerRid` won't match the new container's RID. 2. **Token version mismatch** — A token produced by a newer SDK version may not be readable by an older version. -3. **Operation kind mismatch** — Resuming a `Query` continuation for a `ReadFeed` operation - is rejected. +3. **Structure mismatch** — If the re-created plan produces a different step graph shape + than the token's nested `ResumeState` (e.g., the operation changed, or the plan type + differs), the token is rejected. 
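+
+A minimal validation sketch (hypothetical `TokenError` and `SUPPORTED_VERSION` names; the
+structural check happens later, when the Planner matches the `ResumeState` tree against the
+re-created plan's step graph):
+
+```rust
+fn validate(token: &ContinuationTokenInner, current_rid: &str) -> Result<(), TokenError> {
+    // Rule 1: a recreated container has a different RID — reject.
+    if token.container_rid != current_rid {
+        return Err(TokenError::ContainerRecreated);
+    }
+    // Rule 2: reject tokens written by a newer format version.
+    if token.version > SUPPORTED_VERSION {
+        return Err(TokenError::UnsupportedVersion(token.version));
+    }
+    // Rule 3 (structure mismatch) falls out of plan matching, not a check here.
+    Ok(())
+}
+```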
A continuation token **survives**: @@ -925,14 +1268,18 @@ A continuation token **survives**: 3. **Process boundaries** — The token is a self-contained string, safe to send to a browser and back. -### 7.4 What the Token Does NOT Encode +### 7.6 What the Token Does NOT Encode +- **Per-range state for all partitions** — Only the active Fetch step's state is stored. + Other partitions' positions are reconstructed via query filter rewriting on resume. - **Query text or parameters** — The caller must provide an equivalent `CosmosOperation`. - **Session tokens** — Session consistency is not preserved across process boundaries via the continuation token. The driver resolves session tokens from the `SessionManager` cache for each turn independently. - **Container name or database name** — Only the RID is stored. The caller provides routing context via the `CosmosOperation`. +- **PK range IDs** — Only EPK bounds are stored, which are stable across partition splits. + PK range IDs are resolved dynamically from the `PartitionKeyRangeCache` on resume. --- @@ -1028,91 +1375,97 @@ API or option. ### 10.1 Public API +The driver exposes a single `execute_operation` method for **all** operations — both point +and feed. The driver is stateless across calls: each invocation runs one turn of the plan +and returns a `CosmosResponse`. The response optionally includes a continuation token when +more pages are available. The higher-level SDK (e.g., `azure_data_cosmos`) decides which +operations to surface as pagers from a UX perspective. + ```rust impl CosmosDriver { - /// Executes a point operation (read, write, delete). + /// Executes a Cosmos DB operation (point or feed). /// - /// Internally, this creates a trivial single-step plan and executes it. - /// The overhead is negligible compared to the HTTP round-trip. + /// For point operations (read, create, delete, etc.), this returns the + /// single response with no continuation token. + /// + /// For feed operations (query, read-many, read-all), this executes one + /// turn of the plan and returns a page of results. If more pages are + /// available, the response includes a `ContinuationToken`. The caller + /// passes this token back in `OperationOptions` to fetch the next page. + /// + /// The driver does not manage pagination state — it acts as a stateless + /// service. The SDK layer is responsible for threading continuation tokens + /// across calls to implement pagers/streams. pub async fn execute_operation( &self, operation: CosmosOperation, options: OperationOptions, ) -> azure_core::Result { - // Plan → Execute → drain single page → return CosmosResponse - } - - /// Executes a feed operation (query, read-many, read-all). - /// - /// Returns a `FeedPager` that yields pages of results. The caller - /// pulls pages by calling `next_page()`. Each page includes a - /// continuation token for resumption. - pub async fn execute_feed_operation( - &self, - operation: CosmosOperation, - options: FeedOperationOptions, - ) -> azure_core::Result { - // Plan → wrap executor in FeedPager → return + // Plan → Execute one turn → return CosmosResponse } } ``` -### 10.2 FeedPager +### 10.2 CosmosResponse Changes -`FeedPager` is the public-facing page iterator. It wraps the internal `PlanExecutor` and -provides a stable API that does not expose plan/executor internals. +`CosmosResponse` gains an optional continuation token: ```rust -/// An iterator over pages of feed operation results. -/// -/// Created by [`CosmosDriver::execute_feed_operation`]. 

-### 10.2 FeedPager
+### 10.2 CosmosResponse Changes

-`FeedPager` is the public-facing page iterator. It wraps the internal `PlanExecutor` and
-provides a stable API that does not expose plan/executor internals.
+`CosmosResponse` gains an optional continuation token:

```rust
-/// An iterator over pages of feed operation results.
-///
-/// Created by [`CosmosDriver::execute_feed_operation`]. Yields pages
-/// of results until the operation is complete.
-///
-/// Dropping the `FeedPager` cancels any in-flight requests.
-pub struct FeedPager {
-    executor: PlanExecutor,
-}
+#[non_exhaustive]
+pub struct CosmosResponse {
+    /// Raw response body (UTF-8 JSON or Cosmos binary encoding).
+    body: Vec<u8>,

-impl FeedPager {
-    /// Fetches the next page of results.
-    ///
-    /// Returns `Ok(None)` when no more pages are available.
-    pub async fn next_page(&mut self) -> azure_core::Result<Option<FeedResponsePage>> {
-        self.executor.next_turn(/* ... */).await
-    }
+    /// Extracted Cosmos-specific headers.
+    headers: CosmosResponseHeaders,

-    /// Returns the continuation token for the current position.
-    ///
-    /// The token can be serialized to a string and used to resume the
-    /// operation later by passing it to `FeedOperationOptions::with_continuation`.
+    /// Operation status including HTTP status code and optional sub-status.
+    status: CosmosStatus,
+
+    /// Full diagnostics context for this operation.
+    diagnostics: Arc<DiagnosticsContext>,
+
+    /// Continuation token for feed operations.
+    /// Present when more pages are available; absent for point operations
+    /// and when the feed is fully drained.
+    continuation_token: Option<ContinuationToken>,
+}
+
+impl CosmosResponse {
+    /// Returns the continuation token, if more pages are available.
    ///
-    /// Returns `None` if the operation is complete or does not support
-    /// pagination (e.g., ReadMany).
-    pub fn continuation_token(&self) -> Option<ContinuationToken> {
-        self.executor.continuation_token()
+    /// For point operations, this always returns `None`.
+    /// For feed operations, `None` means the operation is complete.
+    pub fn continuation_token(&self) -> Option<&ContinuationToken> {
+        self.continuation_token.as_ref()
    }
}
```

-### 10.3 FeedResponsePage
+### 10.3 OperationOptions Changes
+
+`OperationOptions` gains feed-specific fields for continuation and concurrency:

```rust
-/// A single page of results from a feed operation.
-///
-/// Contains raw response bytes and metadata. The higher-level SDK
-/// handles deserialization into typed items.
-pub struct FeedResponsePage {
-    /// Raw response body (the items array as JSON bytes).
-    body: Vec<u8>,
+pub struct OperationOptions {
+    // ... existing fields (retry, timeout, consistency, etc.) ...

-    /// Cosmos-specific response headers (RU charge, session token, etc.).
-    headers: CosmosResponseHeaders,
+    /// Maximum number of items per page (feed operations only).
+    /// If not set, the server default applies.
+    max_item_count: Option<u32>,

-    /// Diagnostics for this page (may aggregate multiple sub-request diagnostics).
-    diagnostics: Arc<DiagnosticsContext>,
+    /// Maximum number of concurrent partition key range fetches
+    /// (feed operations only). Default: min(num_pk_ranges, 10).
+    max_concurrency: Option<usize>,
+
+    /// Continuation token for resuming a previous feed operation.
+    /// Pass the token from a previous `CosmosResponse::continuation_token()`.
+    continuation: Option<ContinuationToken>,
}
```

+These fields are ignored for point operations.
+
### 10.4 Ordering Guarantees

| Operation | Order Guarantee |

@@ -1128,7 +1481,7 @@ Page boundaries are determined by:

- **Server-side max item count**: The server may return fewer items than requested.
-- **Client-side max item count**: Configurable via `FeedOperationOptions::max_item_count`.
+- **Client-side max item count**: Configurable via `OperationOptions::max_item_count`.
- **Server continuation**: A page boundary occurs whenever the server returns a continuation
  token.

server-side pages were consumed internally.

---

## 11. 
Configuration Surface -### 11.1 FeedOperationOptions - -```rust -/// Options specific to feed operations. -/// -/// Extends `OperationOptions` with feed-specific settings. -pub struct FeedOperationOptions { - /// Base operation options (retry, timeout, consistency, etc.). - base: OperationOptions, - - /// Maximum number of items per page. - /// If not set, the server default applies. - max_item_count: Option, - - /// Maximum number of concurrent partition key range fetches. - /// Default: min(num_pk_ranges, 10). - max_concurrency: Option, - - /// Continuation token for resuming a previous operation. - continuation: Option, -} -``` - -### 11.2 Layered Options Resolution +### 11.1 OperationOptions Additions -Feed operation options follow the same layered resolution as existing operation options: +Feed-specific options are added to `OperationOptions` (see §10.3). They are ignored for +point operations. The existing layered resolution applies: -1. `FeedOperationOptions` (per-call) +1. `OperationOptions` (per-call) 2. `DriverOptions` (per-driver) 3. `CosmosDriverRuntime` (global) 4. Environment variables -The `max_concurrency` and `max_item_count` fields follow the same precedence. +The `max_concurrency`, `max_item_count`, and `continuation` fields follow the same precedence. --- @@ -1184,12 +1515,12 @@ The plan model MUST NOT regress point operation performance. Requirements: - **No heap allocation** for trivial plans beyond what `execute_operation` does today. - **No additional async machinery** (no spawning, no channels) for single-step plans. - **Benchmark**: Point operation latency with the plan model must be within 1% of the - current direct `execute_operation_pipeline` call. + current direct `execute_single_operation` call. -Implementation: The `execute_operation` method detects trivial operations (based on -`OperationType` and `OperationTarget`) and calls `execute_operation_pipeline` directly, -bypassing the Planner/Executor entirely. The plan model is only instantiated for feed -operations. +Implementation: For point operations and single-partition feeds, the Planner produces an +`OperationPlan::Trivial` — a stack-allocated single step with no `Vec` overhead. The +executor matches on `Trivial` and calls `execute_single_operation` directly with no +graph traversal. The plan model is only heap-allocated for multi-step fan-out operations. ### 12.2 Fan-Out Memory Bounds @@ -1220,7 +1551,7 @@ For paginated queries: ### Phase 2: Plan Infrastructure -1. Implement `OperationPlan`, `PlanStep`, `StepId`. +1. Implement `OperationPlan`, `PlanStep`. 2. Implement `Planner` with trivial single-step planning (point ops only). 3. Implement `PlanExecutor` for single-step plans. 4. Wire `execute_operation` through Plan → Execute path (with fast-path bypass). @@ -1232,8 +1563,8 @@ For paginated queries: - Group items by PK range (via `PartitionKeyRangeCache`). - Create fan-out `Fetch` steps + `UnorderedMerge` step. 2. Implement `UnorderedMerge` step execution in `PlanExecutor`. -3. Implement `FeedPager` and `FeedResponsePage`. -4. Add `execute_feed_operation` to `CosmosDriver`. +3. Wire `execute_operation` to use Plan → Execute for feed operations. +4. Extend `CosmosResponse` with optional `continuation_token` field. 5. Integration tests with partition splits. ### Phase 4: Single-Partition Queries @@ -1270,8 +1601,12 @@ For paginated queries: | PlanExecutor — concurrency | Verify concurrency cap is respected (at most N concurrent fetches). 
| | ContinuationToken — round-trip | Serialize to string, deserialize back, verify equality. | | ContinuationToken — version compat | Older version tokens deserialize correctly. | -| ContinuationToken — split recovery | Token with stale PK range ID maps to new ranges via EPK bounds. | -| OperationTarget — mutual exclusivity | Verify builder rejects invalid combinations. | +| ContinuationToken — split recovery | Token with EPK bounds spanning a split range maps to correct child ranges. | +| ContinuationToken — O(1) size | Token size is constant regardless of partition count (only one Fetch leaf stored). | +| ContinuationToken — Drain resume | Drain node correctly classifies partitions as left/target/right from nested Fetch. | +| ContinuationToken — OrderBy resume | OrderBy node generates correct range filters and dedup state from nested Fetch + lastValues. | +| ContinuationToken — nesting | Nested tokens (e.g., future OffsetLimit wrapping OrderBy wrapping Fetch) round-trip correctly. | +| OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `EpkRange` produce correct targets. | ### 14.2 Integration Tests @@ -1283,7 +1618,7 @@ For paginated queries: | ReadMany — partition split | Trigger split during ReadMany, verify re-plan and completion. | | ReadMany — large set | Read 1000 items, verify server-side pagination within each range works. | | Query — single partition | Execute paginated query, verify continuation threading. | -| Query — resume | Execute query, get continuation, resume in new FeedPager, verify continues. | +| Query — resume | Execute query, get continuation, pass token back in next call, verify continues. | | Diagnostics | Verify RU charges are aggregated across fan-out steps. | | Throughput control | Verify fan-out respects throughput control group limits. | @@ -1291,7 +1626,7 @@ For paginated queries: | Test Area | Metric | |-----------|--------| -| Point op overhead | Latency regression < 1% vs. direct `execute_operation_pipeline`. | +| Point op overhead | Latency regression < 1% vs. direct `execute_single_operation`. | | ReadMany fan-out | Latency scales sub-linearly with partition count (concurrency works). | | Memory bounds | Peak memory for ReadMany of N items is O(N × item_size). | @@ -1308,7 +1643,7 @@ The change feed is a specialized feed operation with unique characteristics: - Incremental mode vs. full-fidelity mode. The current spec reserves extension points in `OperationPayload`, `OperationTarget`, -`ContinuationOperationKind`, and `PlanStep` for change feed support. +`PlanStep`, and `ResumeState` for change feed support. ### 15.2 ORDER BY Merge-Sort From beaeb5ff118c065e7c9d662f44e08572bb83d35a Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Wed, 22 Apr 2026 22:22:23 +0000 Subject: [PATCH 03/29] Replace OpenTelemetry section with diagnostics hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces §8 with Turn/Step/Request diagnostics structure. The driver provides structured timing and concurrency data via raw timestamps; the SDK computes derived values and creates OTEL spans. Introduces TurnDiagnostics and StepDiagnostics with enqueued/started/completed timestamps for concurrency observation. Computed fields (wait time, execution time, max concurrent steps) are omitted to minimize memory. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../docs/FEED_OPERATIONS_SPEC.md | 264 +++++++++++++++--- 1 file changed, 225 insertions(+), 39 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 46a0fe83fb0..0f6dd234cc1 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -16,7 +16,7 @@ 5. [Planner](#5-planner) 6. [Plan Executor](#6-plan-executor) 7. [Continuation Tokens](#7-continuation-tokens) -8. [OpenTelemetry Integration](#8-opentelemetry-integration) +8. [Diagnostics Structure](#8-diagnostics-structure) 9. [Error Handling & Partition Splits](#9-error-handling--partition-splits) 10. [API Semantics & Invariants](#10-api-semantics--invariants) 11. [Configuration Surface](#11-configuration-surface) @@ -134,7 +134,7 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i │ │ ├─ Handle partition splits (re-plan affected ranges) │ │ │ │ ├─ Enforce concurrency caps for fan-out │ │ │ │ ├─ Integrate with throughput control │ │ -│ │ ├─ Emit OpenTelemetry spans │ │ +│ │ ├─ Collect step-level diagnostics (timing, concurrency) │ │ │ │ └─ Produce continuation token in response (if more pages remain) │ │ │ └──────────────────────────────────────────────────────────────────────────┘ │ │ │ │ @@ -784,8 +784,8 @@ pub(crate) struct PlanExecutor { step_states: Vec, /// Concurrency control for fan-out. concurrency_limit: usize, - /// OpenTelemetry context for span linking. - trace_context: FeedTraceContext, + /// Diagnostics builder for collecting step-level timing. + diagnostics: DiagnosticsContextBuilder, } impl PlanExecutor { @@ -807,7 +807,7 @@ impl PlanExecutor { Each call to `execute`: -1. **Emit OpenTelemetry span** for this turn (child of the feed operation span, linked to root). +1. **Record step enqueue** — mark each step as enqueued for concurrency tracking. 2. **Identify runnable steps** — steps whose dependencies are satisfied. 3. **Execute runnable steps concurrently** (up to concurrency cap), each via `execute_single_operation`. @@ -1283,49 +1283,234 @@ A continuation token **survives**: --- -## 8. OpenTelemetry Integration +## 8. Diagnostics Structure -### 8.1 Span Hierarchy +### 8.1 Design Principle -Feed operations produce the following span structure: +The driver does **not** create OpenTelemetry spans or any other telemetry artifacts. Instead, +each call to `execute_operation` returns a `DiagnosticsContext` on the `CosmosResponse` +containing a structured hierarchy of timing, concurrency, and request data. The higher-level +SDK crate uses this data to create OTEL spans, log entries, or any other telemetry it chooses. + +This separation ensures the driver remains transport- and telemetry-agnostic while providing +enough detail for the SDK to reconstruct the full execution timeline. + +### 8.2 Hierarchy: Turn → Step → Request + +Each `execute_operation` call executes one **Turn** of an operation plan. A Turn contains +one or more **Steps** (one per plan step executed), and each Step contains zero or more +**Requests** (the existing `RequestDiagnostics` type, unchanged). ```text -Feed Operation Span (root) - ├── db.cosmosdb.operation = "query_items" (or "read_many", etc.) 
-  ├── db.cosmosdb.container = "my-container"
-  ├── db.cosmosdb.feed_operation_id = <uuid>
-  │
-  ├── Turn 0 Span
-  │    ├── db.cosmosdb.feed_turn_index = 0
-  │    ├── [linked to Feed Operation Span]
-  │    │
-  │    ├── PK Range "0" Fetch Span
-  │    │    └── (transport pipeline spans)
-  │    ├── PK Range "1" Fetch Span
-  │    │    └── (transport pipeline spans)
-  │    └── UnorderedMerge Span
-  │
-  ├── Turn 1 Span (if paginated)
-  │    ├── db.cosmosdb.feed_turn_index = 1
-  │    └── ...
-  ...
-```
-
-### 8.2 Cross-Process Span Linking
-
-When a feed operation is resumed from a continuation token in a different process:
-
-1. The original Feed Operation Span is NOT re-opened (it may have ended).
-2. A new Feed Operation Span is created in the new process.
-3. The continuation token carries the `feed_operation_id` (a UUID).
-4. Each Turn Span in the new process includes a **span link** to the original
-   feed operation ID, enabling distributed tracing tools to connect the turns
-   across process boundaries.
-
-### 8.3 Point Operation Spans
-
-Point operations continue to produce a single span as they do today. The plan/executor layer
-does not add additional span nesting for trivial single-step plans.
+### 8.1 Design Principle
+
+The driver does **not** create OpenTelemetry spans or any other telemetry artifacts. Instead,
+each call to `execute_operation` returns a `DiagnosticsContext` on the `CosmosResponse`
+containing a structured hierarchy of timing, concurrency, and request data. The higher-level
+SDK crate uses this data to create OTEL spans, log entries, or any other telemetry it chooses.
+
+This separation ensures the driver remains transport- and telemetry-agnostic while providing
+enough detail for the SDK to reconstruct the full execution timeline.
+
+### 8.2 Hierarchy: Turn → Step → Request
+
+Each `execute_operation` call executes one **Turn** of an operation plan. A Turn contains
+one or more **Steps** (one per plan step executed), and each Step contains zero or more
+**Requests** (the existing `RequestDiagnostics` type, unchanged).
+
+```text
+DiagnosticsContext
+  └── TurnDiagnostics
+        ├── duration, total RU, concurrency metadata
+        │
+        ├── StepDiagnostics [0]  (e.g., Fetch to PK range "0")
+        │     ├── enqueued_at, started_at, completed_at
+        │     ├── step type, EPK range
+        │     └── RequestDiagnostics [0]  (initial attempt)
+        │         RequestDiagnostics [1]  (retry, if any)
+        │
+        ├── StepDiagnostics [1]  (e.g., Fetch to PK range "1")
+        │     ├── enqueued_at, started_at, completed_at
+        │     └── RequestDiagnostics [0]
+        │
+        └── StepDiagnostics [2]  (e.g., UnorderedMerge)
+              ├── started_at, completed_at
+              └── (no requests — local computation only)
+```
+
+For point operations, the Turn has exactly one Step with one or more Requests (retries).
+The hierarchy is always present but trivially flat.
+
+### 8.3 `TurnDiagnostics`
+
+```rust
+/// Diagnostics for a single turn (one page) of an operation.
+///
+/// Each call to `execute_operation` produces exactly one `TurnDiagnostics`.
+/// For paginated feed operations, the SDK aggregates multiple turns' diagnostics
+/// across pages.
+pub struct TurnDiagnostics {
+    /// Wall-clock duration of the entire turn.
+    duration: Duration,
+
+    /// Total RU charge across all steps and requests in this turn.
+    total_request_charge: RequestCharge,
+
+    /// Per-step diagnostics, in execution order.
+    steps: Vec<StepDiagnostics>,
+
+    /// Concurrency metadata for this turn.
+    concurrency: TurnConcurrency,
+}
+
+/// Concurrency metadata for a turn.
+///
+/// Enables the SDK to observe how steps were parallelized and whether the
+/// concurrency cap was a bottleneck. Wait times and max concurrency can
+/// be computed from the step timestamps by the SDK if needed.
+pub struct TurnConcurrency {
+    /// Total number of steps executed in this turn.
+    steps_executed: usize,
+
+    /// The concurrency cap that was configured for this turn.
+    /// Steps beyond this limit waited for a permit before starting.
+    concurrency_cap: usize,
+}
+```
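+
+For example, the maximum number of steps that actually ran concurrently (one of the derived
+values §8.5 mentions) can be computed by the SDK with a simple sweep over the
+`started_at`/`completed_at` intervals that §8.4 defines. A sketch, not part of the driver
+surface:
+
+```rust
+fn max_concurrent_steps(steps: &[StepDiagnostics]) -> usize {
+    // One +1 event per start, one -1 event per completion.
+    let mut events: Vec<(Instant, i32)> = Vec::new();
+    for step in steps {
+        events.push((step.started_at, 1));
+        events.push((step.completed_at, -1));
+    }
+    // Sort by time; at equal timestamps, completions (-1) sort before starts (+1).
+    events.sort_by_key(|&(t, delta)| (t, delta));
+    let (mut current, mut max) = (0i32, 0i32);
+    for (_, delta) in events {
+        current += delta;
+        max = max.max(current);
+    }
+    max as usize
+}
+```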
+ request_charge: RequestCharge, + + /// Individual HTTP request diagnostics for this step. + /// Empty for non-HTTP steps (e.g., Merge). + /// May contain multiple entries due to retries within the step. + requests: Vec, +} + +/// Identifies the kind of plan step for diagnostics purposes. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum StepType { + /// A Fetch step that executed an HTTP request via execute_single_operation. + Fetch, + /// An UnorderedMerge step that concatenated results from upstream steps. + UnorderedMerge, + // Future: OrderedMerge, OffsetLimit, etc. +} +``` + +### 8.5 Collection Approach + +The `PlanExecutor` records timestamps at key points during execution: + +1. **Step enqueued** (`enqueued_at`): Recorded when the executor submits a step for + execution. For concurrent fan-out, this is when the step requests a permit from the + concurrency semaphore. + +2. **Step started** (`started_at`): Recorded when the step acquires its concurrency permit + and begins executing. For steps that don't use the semaphore (single-step plans, Merge + steps), this equals `enqueued_at`. + +3. **Step completed** (`completed_at`): Recorded when the step finishes (successfully or + with an error). For Fetch steps, this is after `execute_single_operation` returns + (including any retries it performs internally). + +4. **Derived values**: The SDK can compute wait time (`started_at - enqueued_at`), + execution time (`completed_at - started_at`), max concurrent steps (from overlapping + intervals), and total wait time (sum across steps) from the raw timestamps. The driver + stores only the timestamps to minimize memory. + +All timestamps use `Instant::now()` — cheap and monotonic. No allocations beyond the +`Vec` that is already needed for the diagnostics output. No derived +`Duration` fields are stored — the SDK computes them on demand. + +### 8.6 `DiagnosticsContext` Changes + +The existing `DiagnosticsContext` gains a `TurnDiagnostics` field. The flat +`requests: Arc>` is replaced by the nested structure, but a +backward-compatible `requests()` accessor is preserved by flattening the tree: + +```rust +impl DiagnosticsContext { + /// Returns the turn diagnostics for this operation. + pub fn turn(&self) -> &TurnDiagnostics { ... } + + /// Returns all HTTP request diagnostics, flattened across steps. + /// + /// This is backward-compatible with the pre-feed-operations API. + /// Requests are returned in the order they were executed. + pub fn requests(&self) -> Arc> { + // Flatten: turn.steps.iter().flat_map(|s| s.requests.iter()) + } +} ``` -### 8.2 Cross-Process Span Linking +The `DiagnosticsContextBuilder` gains step-tracking methods: + +```rust +impl DiagnosticsContextBuilder { + /// Records that a step has been enqueued for execution. + pub(crate) fn enqueue_step(&mut self, step_type: StepType) -> StepHandle { ... } + + /// Records that a step has started executing (acquired concurrency permit). + pub(crate) fn start_step(&mut self, handle: &StepHandle) { ... } + + /// Records that a step has completed, with its requests. + pub(crate) fn complete_step( + &mut self, + handle: StepHandle, + requests: Vec, + ) { ... } +} +``` + +### 8.7 Granularity Control + +The existing `DiagnosticsVerbosity` enum (Summary / Detailed) controls how the Turn/Step +tree is serialized: + +| Verbosity | Behavior | +|-----------|----------| +| **Summary** | Step-level timing is included but per-step wait times may be omitted. 
Individual `RequestDiagnostics` are deduplicated/aggregated as they are today. Concurrency metadata is included (it's a few integers). | +| **Detailed** | Full tree: all step timestamps (enqueued/started/completed), all individual `RequestDiagnostics` with events, and concurrency metadata. | + +Point operations produce the same output as today at both verbosity levels — the Turn/Step +nesting is transparent when there's only one step. + +### 8.8 Pagination Context -When a feed operation is resumed from a continuation token in a different process: +Each `execute_operation` call produces one `DiagnosticsContext` containing one Turn. The +SDK layer manages pagination and can: -1. The original Feed Operation Span is NOT re-opened (it may have ended). -2. A new Feed Operation Span is created in the new process. -3. The continuation token carries the `feed_operation_id` (a UUID). -4. Each Turn Span in the new process includes a **span link** to the original - feed operation ID, enabling distributed tracing tools to connect the turns - across process boundaries. +1. **Aggregate Turns** — collect `TurnDiagnostics` from multiple pages to produce a + summary of the full pagination operation (total RU, total duration, pages fetched). -### 8.3 Point Operation Spans +2. **Correlate across pages** — the continuation token can optionally carry a + `feed_operation_id` (UUID) so the SDK can link diagnostics from different + `execute_operation` calls that belong to the same logical feed operation. -Point operations continue to produce a single span as they do today. The plan/executor layer -does not add additional span nesting for trivial single-step plans. +3. **Create OTEL spans** — the SDK can create a parent span for the feed operation, + child spans for each Turn, and nested spans for each Step, using the timestamps + and metadata from the diagnostics tree. The driver does not prescribe span structure — + it provides the data. --- @@ -1599,7 +1784,8 @@ For paginated queries: | PlanExecutor — single step | Execute trivial plan, verify result matches direct pipeline call. | | PlanExecutor — fan-out | Execute multi-step plan with mock pipeline, verify merge. | | PlanExecutor — concurrency | Verify concurrency cap is respected (at most N concurrent fetches). | -| ContinuationToken — round-trip | Serialize to string, deserialize back, verify equality. | +| ContinuationToken — serialize | Serialize to string, verify output. | +| ContinuationToken — deserialize | Deserialize from explicit string, verify result. | | ContinuationToken — version compat | Older version tokens deserialize correctly. | | ContinuationToken — split recovery | Token with EPK bounds spanning a split range maps to correct child ranges. | | ContinuationToken — O(1) size | Token size is constant regardless of partition count (only one Fetch leaf stored). 
| From 561d7718616fbd03dbfe720ad9521dd4fed98de9 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Wed, 22 Apr 2026 23:20:24 +0000 Subject: [PATCH 04/29] Refine feed operations spec from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address findings from 9 parallel review agents: - Fix stale execute_operation_pipeline reference in §5.3 diagram - Consolidate resume strategy: §5.5 now summarizes and points to §7.3 - Add backend query plan caching as §2 open issue - Note EffectivePartitionKey needs Ord for EpkRange (§3.2) - Clarify phasing: spec complete at ReadMany; cross-partition query details are forward-looking and not locked - Use Arc in PlanStep::Fetch for fan-out sharing - Add wall_clock_start/start_instant to TurnDiagnostics for OTEL - Add StepOutcome enum to StepDiagnostics for non-HTTP step errors - Minor grammar and code fence fixes Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../docs/FEED_OPERATIONS_SPEC.md | 108 +++++++++++++----- 1 file changed, 82 insertions(+), 26 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 0f6dd234cc1..59ad4062379 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -87,6 +87,13 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i - Merging results into a single response - Integration with the operation pipeline for each sub-request +This spec is **complete when ReadMany works end-to-end**. Sections covering cross-partition +queries, ORDER BY merge-sort, continuation token resume strategies (`Drain`, `OrderBy`), and +`resume_filter` on `PlanStep::Fetch` are included to validate the architecture's extensibility +— they demonstrate that the core plan/execute model accommodates these future scenarios without +redesign. However, their designs are **forward-looking and not locked**; cross-partition query +execution will be specified and implemented in a separate effort. + --- ## 2. Architectural Overview @@ -184,6 +191,17 @@ account metadata changes, falling back to a full re-plan. This optimization is not required for correctness — the stateless model works correctly today — but should be considered for performance-sensitive workloads with many small pages. +### Open Issue: Backend Query Plan Caching + +For cross-partition queries, the Planner fetches a **backend query plan** from the service +(an HTTP request to get the rewritten query and execution metadata). In the stateless model, +this fetch recurs on every page — a redundant network round trip, since the backend query +plan does not change between pages of the same query. A future optimization should cache +the backend query plan (e.g., on the `ContinuationToken` or via a separate cache keyed by +query text + container RID) so that subsequent pages skip the query plan fetch. This is +orthogonal to the cached `OperationPlan` optimization above: the operation plan depends on +partition key ranges (which may split), but the backend query plan does not. + --- ## 3. CosmosOperation Changes @@ -290,7 +308,13 @@ pub enum OperationTarget { /// the `PartitionKeyRangeCache` at execution time. EpkRange(EpkRange), } +``` + +**Implementation note:** `EpkRange` requires `T: Ord + Clone`. The driver's +`EffectivePartitionKey` type currently does not implement `Ord`. 
This must be added +(via `derive` or manual implementation) before `OperationTarget::EpkRange` can be used. +```rust impl OperationTarget { /// The full key space: targets all partition key ranges. pub fn all_ranges() -> Self { @@ -434,9 +458,14 @@ pub(crate) enum PlanStep { /// substitution before sending the request. Fetch { /// The operation to execute. Targeted to a specific PK range. + /// Wrapped in `Arc` so that fan-out steps can share the base + /// operation without cloning the full payload (headers, resource + /// reference, etc.). Each `Fetch` step holds a reference to + /// a retargeted `CosmosOperation` — for fan-out, these share + /// the immutable parts of the original operation. /// For ORDER BY queries, the query text contains the /// `{documentdb-formattableorderbyquery-filter}` placeholder. - operation: CosmosOperation, + operation: Arc, /// Options for this fetch. options: OperationOptions, /// Server-provided continuation token for this range, if resuming. @@ -649,8 +678,8 @@ Caller's CosmosOperation Query{rewritten} Query{rewritten} Query{rewritten} │ │ │ ▼ ▼ ▼ - execute_operation execute_operation execute_operation - _pipeline() _pipeline() _pipeline() + execute_single execute_single execute_single + _operation() _operation() _operation() │ │ │ ▼ ▼ ▼ CosmosResponse CosmosResponse CosmosResponse @@ -663,12 +692,15 @@ Caller's CosmosOperation CosmosResponse ``` -Each decomposed `CosmosOperation` is **retargeted** to a specific EPK range. Note that the -query payload may differ from the caller's original SQL: the backend query plan may -**rewrite the query** (e.g., to push down aggregations, add internal projections, or -restructure filters for per-partition execution), and the Planner uses the rewritten query -text in the decomposed operations. The operation pipeline handles region failover, retry, -and auth for each independently. +Each decomposed `CosmosOperation` is **retargeted** to a specific EPK range and wrapped in +`Arc` so that fan-out steps share the immutable parts of the operation (headers, resource +reference, etc.) without cloning. The Planner creates the retargeted operations and wraps +each in an `Arc`; the executor passes `Arc` to `execute_single_operation`. +Note that the query payload may differ from the caller's original SQL: the backend query +plan may **rewrite the query** (e.g., to push down aggregations, add internal projections, +or restructure filters for per-partition execution), and the Planner uses the rewritten +query text in the decomposed operations. The operation pipeline handles region failover, +retry, and auth for each independently. #### Example: ReadMany @@ -752,22 +784,13 @@ goes through the Planner. ### 5.5 Resuming from a Continuation Token -When a `ContinuationToken` is provided, the Planner uses it to reconstruct the plan state: - -1. Validate the token version, container RID, and operation kind compatibility. -2. Resolve the current partition key ranges for the container. -3. Use `PartitionMapper` to classify each range relative to the token's target range: - - **Left of target** — ranges whose EPK max ≤ target's EPK min. These are fully drained - and receive no `Fetch` step (for unordered) or receive a filter-only `Fetch` step - (for ORDER BY, filtering past the last returned ORDER BY values). - - **Target range** — the range overlapping the token's EPK bounds. Resumes using the - stored `server_continuation`. 
If the range has split, the Planner maps EPK bounds to - the child range(s) and assigns the server continuation appropriately. - - **Right of target** — ranges whose EPK min ≥ target's EPK max. Start fresh with no - continuation (for unordered) or with a filter from the ORDER BY resume state. -4. For `OrderedQuery` tokens, extract the `OrderByResumeState` and generate per-partition - query filters based on the last returned ORDER BY values. Attach duplicate-elimination - state (last `_rid`) for the target range. +When a `ContinuationToken` is provided, the Planner validates it (version, container RID, +operation compatibility), resolves the current partition key ranges, and walks the nested +`ResumeState` tree to reconstruct the plan with the correct per-step state. + +The full resume algorithm — including left/target/right partition classification, filter +generation for ORDER BY, and partition split handling — is described in +[§7.3 Resume Strategy](#73-resume-strategy). --- @@ -1333,6 +1356,21 @@ The hierarchy is always present but trivially flat. /// For paginated feed operations, the SDK aggregates multiple turns' diagnostics /// across pages. pub struct TurnDiagnostics { + /// Wall-clock time when this turn started. + /// + /// Provides an anchor for converting `Instant` timestamps (used in + /// `StepDiagnostics`) to `SystemTime` for OTEL spans or other + /// wall-clock-based telemetry. The SDK can compute a step's wall-clock + /// start as `wall_clock_start + (step.started_at - start_instant)`. + wall_clock_start: SystemTime, + + /// Monotonic timestamp when this turn started. + /// + /// Used as the reference point for computing wall-clock times from + /// step-level `Instant` timestamps: for any step `Instant` value `i`, + /// the wall-clock time is `wall_clock_start + (i - start_instant)`. + start_instant: Instant, + /// Wall-clock duration of the entire turn. duration: Duration, @@ -1403,6 +1441,24 @@ pub struct StepDiagnostics { /// Empty for non-HTTP steps (e.g., Merge). /// May contain multiple entries due to retries within the step. requests: Vec, + + /// Outcome of this step's execution. + /// + /// For Fetch steps, the outcome is typically captured in the + /// `RequestDiagnostics`. This field captures outcomes for non-HTTP + /// steps (e.g., Merge failures) and provides a summary for all + /// step types without requiring callers to inspect nested requests. + outcome: StepOutcome, +} + +/// Outcome of a plan step's execution. +#[derive(Clone, Debug)] +pub enum StepOutcome { + /// The step completed successfully. + Success, + /// The step failed with an error. + /// The message is a brief summary (not a full stack trace). + Failed { message: String }, } /// Identifies the kind of plan step for diagnostics purposes. @@ -1489,7 +1545,7 @@ tree is serialized: | Verbosity | Behavior | |-----------|----------| -| **Summary** | Step-level timing is included but per-step wait times may be omitted. Individual `RequestDiagnostics` are deduplicated/aggregated as they are today. Concurrency metadata is included (it's a few integers). | +| **Summary** | Step-level timing is included but per-step wait times may be omitted. Individual `RequestDiagnostics` are deduplicated/aggregated as they are today. Concurrency metadata is included (a few integers). | | **Detailed** | Full tree: all step timestamps (enqueued/started/completed), all individual `RequestDiagnostics` with events, and concurrency metadata. 
| Point operations produce the same output as today at both verbosity levels — the Turn/Step From fbe9a0a4fa719f0e249a4ef82c75a77084de89a5 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Wed, 22 Apr 2026 23:39:35 +0000 Subject: [PATCH 05/29] Remove backend query plan caching section The cached operation plan already subsumes the backend query plan, which the Planner consumes during planning and does not need afterward. No separate caching mechanism is needed. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../docs/FEED_OPERATIONS_SPEC.md | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 59ad4062379..7b7d42bde04 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -190,17 +190,8 @@ account metadata changes, falling back to a full re-plan. This optimization is not required for correctness — the stateless model works correctly today — but should be considered for performance-sensitive workloads with many small pages. - -### Open Issue: Backend Query Plan Caching - -For cross-partition queries, the Planner fetches a **backend query plan** from the service -(an HTTP request to get the rewritten query and execution metadata). In the stateless model, -this fetch recurs on every page — a redundant network round trip, since the backend query -plan does not change between pages of the same query. A future optimization should cache -the backend query plan (e.g., on the `ContinuationToken` or via a separate cache keyed by -query text + container RID) so that subsequent pages skip the query plan fetch. This is -orthogonal to the cached `OperationPlan` optimization above: the operation plan depends on -partition key ranges (which may split), but the backend query plan does not. +The cached operation plan subsumes the backend query plan (which the Planner consumes during +planning and does not need afterward), so no separate query plan caching is needed. --- From 4349142bc20eadd0946c153cc5de9be983695d01 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Thu, 23 Apr 2026 17:03:48 +0000 Subject: [PATCH 06/29] Handle splits within Fetch steps, keeping them focused on the same EPK range --- .../docs/FEED_OPERATIONS_SPEC.md | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 7b7d42bde04..d9260bc9ecf 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -138,7 +138,7 @@ execution will be specified and implemented in a separate effort. │ │ Responsibilities: │ │ │ │ ┌─ Execute plan steps with configurable concurrency │ │ │ │ ├─ Each step calls execute_single_operation() for HTTP │ │ -│ │ ├─ Handle partition splits (re-plan affected ranges) │ │ +│ │ ├─ Handle partition splits (Fetch resolves EPK → PK ranges) │ │ │ │ ├─ Enforce concurrency caps for fan-out │ │ │ │ ├─ Integrate with throughput control │ │ │ │ ├─ Collect step-level diagnostics (timing, concurrency) │ │ @@ -182,11 +182,11 @@ resolution, and for cross-partition queries, potentially a backend query plan fe page. 
For in-process callers (the common case), this is wasteful: the SDK crate calls -`execute_operation` in a loop, and the plan doesn't change between pages (barring partition -splits). A future optimization could allow `CosmosResponse` and/or `CosmosOperation` to -carry a **cached `OperationPlan`** so that subsequent requests skip re-planning when the -plan is still valid. The cached plan would be invalidated on partition splits (410/1002) or -account metadata changes, falling back to a full re-plan. +`execute_operation` in a loop, and the plan structure doesn't change between pages (Fetch +steps handle partition splits internally by re-resolving EPK ranges). A future optimization +could allow `CosmosResponse` and/or `CosmosOperation` to carry a **cached `OperationPlan`** +so that subsequent requests skip re-planning when the plan is still valid. The cached plan +would be invalidated on account metadata changes, falling back to a full re-plan. This optimization is not required for correctness — the stateless model works correctly today — but should be considered for performance-sensitive workloads with many small pages. @@ -442,6 +442,14 @@ pub(crate) enum OperationPlan { pub(crate) enum PlanStep { /// Execute a single HTTP request via the operation pipeline. /// + /// Each Fetch step targets a specific **EPK range** (not a PK range ID). + /// At execution time, the step resolves its EPK range to the current PK + /// range ID(s) via the `PartitionKeyRangeCache`. If the EPK range maps + /// to multiple PK ranges (due to a partition split), the Fetch step + /// internally issues concurrent requests to all relevant PK ranges — + /// there is no requirement that the concurrency semaphore issues one + /// permit per step, and splits do not require mutating the plan graph. + /// /// The `operation` carries the **unrewritten** query from the backend /// query plan, which may contain the `{documentdb-formattableorderbyquery-filter}` /// placeholder token. At execution time, the executor replaces this @@ -1565,24 +1573,28 @@ SDK layer manages pagination and can: ### 9.1 Partition Split During Execution -When a `Fetch` step receives a 410/1002 (Gone — PartitionKeyRangeGone) response: +Fetch steps target **EPK ranges**, not PK range IDs. When a Fetch step receives a 410/1002 +(Gone — PartitionKeyRangeGone) response: 1. **Invalidate** the `PartitionKeyRangeCache` for the affected container. 2. **Re-fetch** the partition key ranges. -3. **Re-plan** the affected step: the original PK range has split into two or more new - ranges. The executor replaces the single `Fetch` step with new `Fetch` steps for each - new range. -4. **Update the `UnorderedMerge` step** (if any) to include the new steps. -5. **Resume execution** with the new steps. - -The continuation token must survive this: since tokens store EPK bounds (not just PK range -IDs), the re-plan can correctly map EPK bounds to the new PK range IDs. +3. **Re-resolve** the Fetch step's EPK range to the new child PK range IDs. The step's EPK + range now maps to multiple PK ranges. +4. **Issue concurrent requests** to all child PK ranges within the step. The plan structure + does not change — the Fetch step internally fans out. There is no requirement that the + concurrency semaphore issues one permit per step; a single Fetch step may hold multiple + concurrent requests after a split. +5. **Resume execution** with the child range results. + +The plan graph remains stable across splits — no steps are added, removed, or rewired. 
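
To make this concrete, here is a minimal sketch of that re-resolution loop. The `RangeCache` trait and the `PkRange`, `Page`, and `FetchError` types are illustrative stand-ins for the driver's actual cache and transport types, not real APIs:

```rust
// Illustrative stand-ins, not the driver's actual types.
struct EpkRange { min: String, max: String }
struct PkRange { id: String }
struct Page;
enum FetchError { Gone, Other(String) }

trait RangeCache {
    fn resolve(&self, epk: &EpkRange) -> Vec<PkRange>;
    fn invalidate(&self, epk: &EpkRange);
}

fn fetch_with_split_recovery(
    cache: &dyn RangeCache,
    epk: &EpkRange,
    send: &dyn Fn(&PkRange) -> Result<Page, FetchError>,
) -> Result<Vec<Page>, FetchError> {
    'resolve: loop {
        // One EPK range may map to several PK ranges after a split.
        let pk_ranges = cache.resolve(epk);
        let mut pages = Vec::with_capacity(pk_ranges.len());
        for pk_range in &pk_ranges {
            match send(pk_range) {
                Ok(page) => pages.push(page),
                // 410/1002: refresh the cache and re-resolve. The Fetch step
                // itself is untouched; only its EPK-to-PK resolution changes.
                Err(FetchError::Gone) => {
                    cache.invalidate(epk);
                    continue 'resolve;
                }
                Err(other) => return Err(other),
            }
        }
        return Ok(pages);
    }
}
```

Because the loop always re-resolves from the stored EPK bounds, the same code path covers the no-split fast path and repeated splits without any mutation of the plan graph.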
+The continuation token survives because it stores EPK bounds (not PK range IDs), and the +Fetch step re-resolves those bounds to current PK range IDs on each execution. ### 9.2 Error Propagation | Error Scenario | Behavior | |----------------|----------| -| 410/1002 (PartitionKeyRangeGone) | Re-plan affected range(s), retry. | +| 410/1002 (PartitionKeyRangeGone) | Fetch step re-resolves EPK range to child PK ranges, retries. | | 429 (Throttled) | Handled by transport pipeline (backoff + retry). | | 503 (Service Unavailable) | Handled by operation pipeline (region failover). | | 404 (Not Found) — container | Fail the entire feed operation. | @@ -1848,7 +1860,7 @@ For paginated queries: | ReadMany — basic | Read 10 items across 3 partitions, verify all returned. | | ReadMany — missing items | Read items where some don't exist, verify present items returned. | | ReadMany — single partition | All items in one partition, verify no unnecessary fan-out. | -| ReadMany — partition split | Trigger split during ReadMany, verify re-plan and completion. | +| ReadMany — partition split | Trigger split during ReadMany, verify Fetch step re-resolves and completes. | | ReadMany — large set | Read 1000 items, verify server-side pagination within each range works. | | Query — single partition | Execute paginated query, verify continuation threading. | | Query — resume | Execute query, get continuation, pass token back in next call, verify continues. | From 0b506b675b1008abdba60c8495943bbb9191227e Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Thu, 23 Apr 2026 11:10:10 -0700 Subject: [PATCH 07/29] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index d9260bc9ecf..3ac1b3b594c 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -302,8 +302,8 @@ pub enum OperationTarget { ``` **Implementation note:** `EpkRange` requires `T: Ord + Clone`. The driver's -`EffectivePartitionKey` type currently does not implement `Ord`. This must be added -(via `derive` or manual implementation) before `OperationTarget::EpkRange` can be used. +`EffectivePartitionKey` type already implements `Ord`, so the existing implementation +is sufficient for `OperationTarget::EpkRange`. ```rust impl OperationTarget { From 41a43533d571117336567987c94ddaa4e8e3bf38 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Wed, 29 Apr 2026 00:04:15 +0000 Subject: [PATCH 08/29] spec refinements --- .../docs/FEED_OPERATIONS_SPEC.md | 1886 +++++++---------- 1 file changed, 813 insertions(+), 1073 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 3ac1b3b594c..72d26ffa467 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -1,7 +1,7 @@ # Feed Operations Spec for `azure_data_cosmos_driver` **Status:** Draft / Iterating -**Date:** 2026-04-21 +**Date:** 2026-04-28 **Authors:** (team) **Crate:** `azure_data_cosmos_driver` @@ -19,11 +19,8 @@ 8. [Diagnostics Structure](#8-diagnostics-structure) 9. 
[Error Handling & Partition Splits](#9-error-handling--partition-splits) 10. [API Semantics & Invariants](#10-api-semantics--invariants) -11. [Configuration Surface](#11-configuration-surface) -12. [Performance & Non-Regression](#12-performance--non-regression) -13. [Migration Plan](#13-migration-plan) -14. [Testing Strategy](#14-testing-strategy) -15. [Future Work](#15-future-work) +11. [Testing Strategy](#11-testing-strategy) +12. [Future Work](#12-future-work) --- @@ -36,10 +33,9 @@ and produce a single response. Operations like `ReadItem`, `UpsertItem`, and `De through `execute_operation`, which drives the operation pipeline (region failover, session tokens, transport retry) and returns a single `CosmosResponse`. -**Feed operations** — queries, read-many, read-all-items, and change feed — are fundamentally -different. They produce multiple pages of results, may fan out across partition key ranges, may -require backend-provided query plans, and need pagination state that can be serialized across -request boundaries. +**Feed operations** — read-all-items, queries, read-many, and change feed — are fundamentally +different. They produce multiple pages of results, may span multiple partition key ranges, and +need pagination state that can be serialized across request boundaries. Today, feed operations are handled entirely in the higher-level `azure_data_cosmos` crate, bypassing the driver's operation pipeline. This means feed operations miss out on the driver's multi-region @@ -48,8 +44,8 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i ### Goals 1. **Unified execution model** — Both point and feed operations flow through a common - Plan → Execute pipeline. Point operations produce a trivial single-step plan. Feed operations - produce multi-step plans that leverage the existing point-operation pipeline for individual + Plan → Execute pipeline. Point operations produce a trivial single-node plan. Feed operations + produce multi-node plans that leverage the existing point-operation pipeline for individual HTTP requests. 2. **Resumable pagination** — Feed operations produce a typed continuation token that can be @@ -57,9 +53,9 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i Resuming with a valid continuation token and an equivalent operation descriptor continues where the previous execution left off. -3. **Extensible operation model** — The plan model must support ReadMany (the initial target), - cross-partition queries, single-partition queries/reads, and change feed, even if some are - implemented later. +3. **Extensible operation model** — The plan model must support ReadAll (the initial target), + cross-partition queries, single-partition queries/reads, read-many, and change feed, even if + some are implemented later. 4. **Driver-level concerns** — Feed operations must integrate with multi-region failover, partition-level failover (PPAF/PPCB), throughput control, session consistency, and @@ -67,32 +63,42 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i 5. **Schema-agnostic pages** — The driver returns response pages as raw bytes (`Vec`). The higher-level SDK handles deserialization, consistent with the existing `CosmosResponse` - model. Future work (sort, aggregate) will require the driver to understand feed envelopes, - but the initial design reserves space for this without requiring it. + model. 6. 
**Performance non-regression** — Point operations must not pay measurable overhead for the - unified plan model. Trivial plans must be allocation-light. + unified plan model. Trivial plans must be allocation-light. No heap allocation for trivial + plans beyond what `execute_operation` does today. No additional async machinery (no spawning, + no channels) for single-node plans. ### Non-Goals (This Spec) - Full cross-partition query execution with ORDER BY merge-sort and aggregation (future work). - Change feed full design (future work; this spec reserves extension points). +- ReadMany fan-out with concurrent partition fetching (future work). - Client-side query rewriting or optimization. +- Concurrent partition fetching or merge steps. ### Primary Target -**ReadMany** is the first feed operation to implement. It exercises: +**ReadAll** is the first feed operation to implement. It reads all documents from a container by +draining partitions sequentially in effective partition key (EPK) order. It exercises: + - Partition key range resolution (via `PartitionKeyRangeCache`) -- Fan-out across multiple partition key ranges -- Merging results into a single response +- Sequential traversal across partition key ranges in EPK order +- EPK range filtering via `x-ms-documentdb-epk-min` and `x-ms-documentdb-epk-max` headers +- Paginated reads within each partition +- Continuation token serialization and resume across SDK versions - Integration with the operation pipeline for each sub-request -This spec is **complete when ReadMany works end-to-end**. Sections covering cross-partition -queries, ORDER BY merge-sort, continuation token resume strategies (`Drain`, `OrderBy`), and -`resume_filter` on `PlanStep::Fetch` are included to validate the architecture's extensibility -— they demonstrate that the core plan/execute model accommodates these future scenarios without -redesign. However, their designs are **forward-looking and not locked**; cross-partition query -execution will be specified and implemented in a separate effort. +This spec is **complete when ReadAll works end-to-end** through the Plan → Execute pipeline. +Sections on continuation tokens and the plan model are designed to be extensible for future +operations (ReadMany, cross-partition query, change feed) without requiring a redesign. + +**Ordering semantics:** ReadAll drains partitions in EPK order as an implementation behavior. +Within each partition, items are returned in (PartitionKey, ID) ascending order — the natural +sort order of `SELECT *`. This is a driver-emitted ordering, **not** a service-level ordering +guarantee. The service does not guarantee global cross-partition order without explicit +`ORDER BY`. --- @@ -111,7 +117,7 @@ execution will be specified and implemented in a separate effort. │ │ │ Internally: │ │ 1. Planner creates an OperationPlan │ -│ 2. PlanExecutor runs one turn of the plan │ +│ 2. PlanExecutor runs one page of the plan │ │ 3. Returns CosmosResponse (with optional continuation token) │ │ │ │ ┌──────────────────────────────────────────────────────────────────────────┐ │ @@ -122,10 +128,9 @@ execution will be specified and implemented in a separate effort. 
│ │ │ │ │ │ Responsibilities: │ │ │ │ ┌─ Determine targeting (point EPK, sub-range, full key space) │ │ -│ │ ├─ For ReadMany: group items by PK range, create fan-out steps │ │ -│ │ ├─ For cross-partition query: fetch backend query plan, create steps │ │ -│ │ ├─ For single-partition ops: create single-step plan │ │ -│ │ └─ For point ops: create trivial single-step plan │ │ +│ │ ├─ For ReadAll: resolve PK ranges, create Drain over Fetch nodes │ │ +│ │ ├─ For single-partition ops: create single-node plan │ │ +│ │ └─ For point ops: create trivial single-node plan │ │ │ └──────────────────────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ @@ -133,15 +138,12 @@ execution will be specified and implemented in a separate effort. │ │ PLAN EXECUTOR │ │ │ │ │ │ │ │ Input: OperationPlan │ │ -│ │ Output: CosmosResponse (single turn / single page) │ │ +│ │ Output: CosmosResponse (single page) │ │ │ │ │ │ │ │ Responsibilities: │ │ -│ │ ┌─ Execute plan steps with configurable concurrency │ │ -│ │ ├─ Each step calls execute_single_operation() for HTTP │ │ +│ │ ┌─ Execute one Fetch node via execute_single_operation() │ │ │ │ ├─ Handle partition splits (Fetch resolves EPK → PK ranges) │ │ -│ │ ├─ Enforce concurrency caps for fan-out │ │ -│ │ ├─ Integrate with throughput control │ │ -│ │ ├─ Collect step-level diagnostics (timing, concurrency) │ │ +│ │ ├─ Collect node-level diagnostics (timing) │ │ │ │ └─ Produce continuation token in response (if more pages remain) │ │ │ └──────────────────────────────────────────────────────────────────────────┘ │ │ │ │ @@ -167,31 +169,28 @@ individual HTTP request within a plan. | Concern | Component | Location | |---------|-----------|----------| | Operation intent & payload | `CosmosOperation` | `models/cosmos_operation.rs` | -| Plan creation | `Planner` | `driver/feed/planner.rs` (new) | -| Plan model | `OperationPlan`, `PlanStep` | `driver/feed/plan.rs` (new) | -| Plan execution | `PlanExecutor` | `driver/feed/executor.rs` (new) | +| Plan creation | `Planner` | `driver/plan/planner.rs` (new) | +| Plan model | `OperationPlan`, `PlanNode` | `driver/plan/plan.rs` (new) | +| Plan execution | `PlanExecutor` | `driver/plan/executor.rs` (new) | | Continuation state | `ContinuationToken` | `models/continuation_token.rs` (new) | -| Per-step HTTP execution | `execute_single_operation` | `driver/pipeline/` (existing) | +| Per-node HTTP execution | `execute_single_operation` | `driver/pipeline/` (existing) | ### Open Issue: Re-Planning on Every Page Because `execute_operation` is stateless, the driver must re-plan the operation on every call — including subsequent pages of a paginated feed. The Planner uses the continuation token to reconstruct the plan state, but still performs the full planning step (PK range -resolution, and for cross-partition queries, potentially a backend query plan fetch) on each -page. +resolution) on each page. For in-process callers (the common case), this is wasteful: the SDK crate calls `execute_operation` in a loop, and the plan structure doesn't change between pages (Fetch -steps handle partition splits internally by re-resolving EPK ranges). A future optimization +nodes handle partition splits internally by re-resolving EPK ranges). A future optimization could allow `CosmosResponse` and/or `CosmosOperation` to carry a **cached `OperationPlan`** so that subsequent requests skip re-planning when the plan is still valid. The cached plan would be invalidated on account metadata changes, falling back to a full re-plan. 
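
As a rough illustration of what such plan reuse could look like, the sketch below assumes a hypothetical `CachedPlan` wrapper and a `metadata_epoch` counter for validity; neither is part of the current design:

```rust
use std::sync::Arc;

/// Stand-in for the `OperationPlan` defined in section 4 (sketch only).
struct OperationPlan;

/// Hypothetical cache entry carried by `CosmosResponse`/`CosmosOperation`.
struct CachedPlan {
    plan: Arc<OperationPlan>,
    /// Epoch of the PK-range/account metadata the plan was built against.
    metadata_epoch: u64,
}

fn plan_or_reuse(
    cached: Option<&CachedPlan>,
    current_epoch: u64,
    replan: impl FnOnce() -> OperationPlan,
) -> Arc<OperationPlan> {
    match cached {
        // Reuse: the metadata this plan depends on has not changed.
        Some(c) if c.metadata_epoch == current_epoch => Arc::clone(&c.plan),
        // Stale or missing entry: fall back to a full re-plan.
        _ => Arc::new(replan()),
    }
}
```

An epoch comparison keeps the validity check O(1) per page; any account metadata change bumps the epoch and forces a fresh plan.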
This optimization is not required for correctness — the stateless model works correctly today — but should be considered for performance-sensitive workloads with many small pages. -The cached operation plan subsumes the backend query plan (which the Planner consumes during -planning and does not need afterward), so no separate query plan caching is needed. --- @@ -216,28 +215,17 @@ would break `Copy` and mix operation semantics with operation payload — we spl /// Each variant carries exactly the data needed for its operation type. #[derive(Clone, Debug)] pub enum OperationPayload { - /// No payload needed (e.g., ReadItem, DeleteItem, ReadContainer). + /// No payload needed (e.g., ReadItem, DeleteItem, ReadContainer, ReadAllItems). None, /// Raw body bytes (e.g., CreateItem, UpsertItem, ReplaceItem). /// The caller provides pre-serialized JSON. Body(Vec), - /// SQL query text with optional parameters (e.g., QueryItems). - Query { - /// The SQL query text. - query: String, - /// Pre-serialized parameters JSON array, if any. - parameters: Option>, - }, - - /// ReadMany item descriptors: (item_id, partition_key) pairs. - ReadMany { - /// The items to read, as (id, partition_key) pairs. - items: Vec<(String, PartitionKey)>, - }, - - // Future: ChangeFeed { mode, start_from, ... } + // Future variants: + // Query { query: String, parameters: Option> }, + // ReadMany { items: Vec<(String, PartitionKey)> }, + // ChangeFeed { mode, start_from, ... }, } ``` @@ -319,11 +307,6 @@ impl OperationTarget { } ``` -**Future optimization:** `EpkRange` could gain an optional PK range ID hint to skip the -cache lookup when the mapping is already known (e.g., from a previous routing decision or a -cached plan). The hint would be advisory — the pipeline would fall back to EPK-based -resolution if the hint is stale after a partition split. - ### 3.3 Factory Method Updates Existing factory methods are updated to use `OperationPayload` and `OperationTarget`: @@ -350,50 +333,13 @@ impl CosmosOperation { // Caller attaches body via .with_payload(OperationPayload::Body(...)) } - /// Queries items within a single partition. - pub fn query_items( - container: ContainerReference, - partition_key: PartitionKey, - query: impl Into, - ) -> Self { + /// Reads all items across all partitions. + pub fn read_all_items(container: ContainerReference) -> Self { let resource_ref = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); - Self::new(OperationType::Query, resource_ref) - .with_target(OperationTarget::PartitionKey(partition_key)) - .with_payload(OperationPayload::Query { - query: query.into(), - parameters: None, - }) - } - - /// Queries items across all partitions. - pub fn query_items_cross_partition( - container: ContainerReference, - query: impl Into, - ) -> Self { - let resource_ref = CosmosResourceReference::from(container) - .with_resource_type(ResourceType::Document) - .into_feed_reference(); - Self::new(OperationType::Query, resource_ref) - .with_target(OperationTarget::all_ranges()) - .with_payload(OperationPayload::Query { - query: query.into(), - parameters: None, - }) - } - - /// Reads multiple items by their ID/partition-key pairs. 
- pub fn read_many( - container: ContainerReference, - items: Vec<(String, PartitionKey)>, - ) -> Self { - let resource_ref = CosmosResourceReference::from(container) - .with_resource_type(ResourceType::Document) - .into_feed_reference(); - Self::new(OperationType::Query, resource_ref) + Self::new(OperationType::ReadFeed, resource_ref) .with_target(OperationTarget::all_ranges()) - .with_payload(OperationPayload::ReadMany { items }) } } ``` @@ -406,9 +352,9 @@ method or via `.with_payload(...)`. A convenience method `with_body(Vec)` ca sugar for `with_payload(OperationPayload::Body(...))`. The transport pipeline's request builder must be updated to extract body bytes from -`OperationPayload` when constructing the HTTP request. For `Body` and `Query` variants, this -is straightforward serialization. For `ReadMany`, the Planner decomposes the operation before -it reaches the transport pipeline, so the transport never sees a `ReadMany` payload directly. +`OperationPayload` when constructing the HTTP request. For `Body` variants, this is +straightforward. For `None`, no body is sent. Future payload variants (Query, ReadMany) +will be handled by the Planner before reaching the transport pipeline. --- @@ -416,167 +362,212 @@ it reaches the transport pipeline, so the transport never sees a `ReadMany` payl ### 4.1 Plan Model -An `OperationPlan` describes the steps needed to execute an operation. It is an enum with -two variants: `Trivial` for single-step plans (stack-allocated, no heap overhead) and -`MultiStep` for fan-out plans. +An `OperationPlan` describes the nodes needed to execute an operation. The Planner builds an +Operation Plan which is made up of Nodes. Each Node represents an operation in the pipeline. + +Rust's ownership model does not lend itself well to owning tree structures with parent-child +references. Instead, the plan uses a **flat list of nodes** with index-based references: + +- **`NodeId`** is an offset into the plan's node list, used for parent-child relationships. +- **`NodeRange`** is a `[start, end)` pair of `NodeId` values representing a contiguous slice + of children, avoiding a separate `Vec` heap allocation. +- Nodes are stored **bottom-up**: child nodes always appear before their parents in the list. + This makes `NodeId` values stable and deterministic — the same plan input always produces + the same node ordering. + +```rust +/// Index of a node within an `OperationPlan::Graph`'s node list. +/// +/// NodeIds are stable within a plan: the same inputs produce the same +/// node ordering. Children always have lower NodeIds than their parents +/// (bottom-up invariant). +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct NodeId(u32); + +/// A contiguous range of node indices `[start, end)`. +/// +/// Used to reference a slice of children without a separate heap allocation. +/// Children in a NodeRange are always contiguous in the node list because +/// they are built together by the planner. +#[derive(Clone, Copy, Debug)] +pub(crate) struct NodeRange { + pub start: NodeId, + pub end: NodeId, +} + +impl NodeRange { + pub fn len(&self) -> usize { + (self.end.0 - self.start.0) as usize + } + + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + pub fn iter(&self) -> impl Iterator { + (self.start.0..self.end.0).map(NodeId) + } +} +``` ```rust /// A plan for executing an operation. /// -/// Plans range from trivial (single step for a point read) to complex -/// (fan-out across partition key ranges with merge). 
The plan is created
+/// Plans range from trivial (single node for a point read) to multi-node
+/// (sequential drain across partition key ranges). The plan is created
 /// by the Planner and executed by the PlanExecutor.
pub(crate) enum OperationPlan {
-    /// A single-step plan. Stack-allocated, no heap overhead.
+    /// A single-node plan, stored inline. No heap allocation.
     /// Used for point operations and single-partition feed operations.
-    Trivial(PlanStep),
-
-    /// A multi-step plan. The last step in the Vec is the output step.
-    /// Used for fan-out operations (ReadMany, cross-partition queries).
-    MultiStep {
-        steps: Vec<PlanStep>,
+    SingleNode(PlanNode),
+
+    /// A multi-node plan stored as a flat list of nodes.
+    /// Nodes are stored bottom-up: children appear before parents.
+    /// Used for cross-partition feed operations (e.g., ReadAll).
+    Graph {
+        /// The flat list of nodes. Children appear before parents.
+        nodes: Vec<PlanNode>,
+        /// The root node of the plan (always the last node in the list).
+        root: NodeId,
    },
}
```

```rust
/// A node in an operation plan.
///
/// Nodes reference each other via `NodeId` and `NodeRange` within the
/// flat node list. Composite nodes (Drain) reference child nodes;
/// leaf nodes (Fetch) have no children.
pub(crate) enum PlanNode {
    /// Execute a single HTTP request via the operation pipeline.
    ///
    /// Each Fetch node targets a specific **EPK range** (not a PK range ID).
    /// At execution time, the node resolves its EPK range to the current PK
    /// range ID(s) via the `PartitionKeyRangeCache`. If the EPK range maps
    /// to multiple PK ranges (due to a partition split), the Fetch node
    /// internally re-resolves and issues requests to the appropriate child
    /// PK ranges. The next time the plan is generated, the EPK ranges will
    /// reflect the split, and the plan resumes with the new ranges.
    Fetch {
        /// The operation to execute, targeted to a specific EPK range.
        /// Wrapped in `Arc` so that sibling Fetch nodes can share the base
        /// operation without cloning the full payload (headers, resource
        /// reference, etc.).
        operation: Arc<CosmosOperation>,
        /// Options for this fetch.
        options: OperationOptions,
        /// The EPK range this fetch targets.
        epk_range: EpkRange,
        /// Server-provided continuation token for this range, if resuming.
        continuation: Option<String>,
    },

    /// Sequential cross-partition drain.
    ///
    /// Enumerates child Fetch nodes in EPK order, draining each partition
    /// completely before moving to the next. Each page comes from exactly
    /// one partition — pages do not span partition boundaries.
    ///
    /// Within each partition, items are returned in (PartitionKey, ID)
    /// ascending order (the natural server sort order).
    Drain {
        /// Child Fetch nodes, ordered by EPK range.
        /// References a contiguous range in the plan's node list.
        children: NodeRange,
    },

    // Future variants:
    // UnorderedMerge { children: NodeRange },
    // OrderedMerge { children: NodeRange, order_by: ... },
    // Aggregate { children: NodeRange, aggregation: ... },
}
```

### 4.2 Bottom-Up Invariant

The flat node list is always built **bottom-up**: leaf nodes (Fetch) are pushed first,
then their parent (Drain) is pushed after them. This produces a deterministic layout where
`NodeId` values are stable for a given set of inputs.

For a ReadAll plan over 3 partitions, the node list looks like:

```text
Index  Node
─────  ──────────────────────────────────────────
  0    Fetch { epk_range: ["","55"), ... }
  1    Fetch { epk_range: ["55","AA"), ... }
  2    Fetch { epk_range: ["AA","FF"), ... }
  3    Drain { children: NodeRange(0..3) }

root = NodeId(3)
```

The `NodeRange(0..3)` for the Drain's children is a zero-cost reference to the contiguous
slice of Fetch nodes. No `Vec<NodeId>` allocation is needed.
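
A minimal sketch of this bottom-up construction, using the `NodeId`, `NodeRange`, and `PlanNode` shapes from §4.1. The cloneable `OperationOptions` parameter and the pre-resolved, ordered `epk_ranges` input are assumptions for illustration:

```rust
use std::sync::Arc;

// Sketch only; `CosmosOperation`, `OperationOptions`, `EpkRange`, and the
// plan types are the shapes defined elsewhere in this spec.
fn build_read_all_plan(
    operation: Arc<CosmosOperation>,
    options: OperationOptions,
    epk_ranges: Vec<EpkRange>,
) -> OperationPlan {
    let mut nodes = Vec::with_capacity(epk_ranges.len() + 1);

    // Children first: one Fetch per partition, in EPK order.
    let start = NodeId(nodes.len() as u32);
    for epk_range in epk_ranges {
        nodes.push(PlanNode::Fetch {
            operation: Arc::clone(&operation),
            options: options.clone(),
            epk_range,
            continuation: None,
        });
    }
    let end = NodeId(nodes.len() as u32);

    // Parent last: the Drain references its children as a contiguous range.
    nodes.push(PlanNode::Drain { children: NodeRange { start, end } });
    let root = NodeId((nodes.len() - 1) as u32);

    OperationPlan::Graph { nodes, root }
}
```

Because children are always pushed before their parent, the bottom-up invariant holds by construction, and the root is always the last node in the list.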
-**Optimization:** When a PK range contains only a single item, the Planner MAY optimize -the `Fetch` step to a point read (`OperationType::Read` with a point EPK range target) -instead of a query, avoiding the overhead of query parsing on the backend. +### 4.3 Plan Examples -#### Single-Partition Query +#### Point Operation (ReadItem) ```text -Step 0: Fetch(query to PK "my-pk", continuation: None) → output +SingleNode(Fetch { operation: read_item, epk_range: pk_epk, continuation: None }) ``` -On subsequent turns, the executor updates the continuation in step 0 and re-executes. -Each turn yields one page. +A `SingleNode` plan with one `Fetch` node. The executor runs it directly, gets a +`CosmosResponse`, done. No heap allocation. -#### Cross-Partition Query (Future) +#### ReadAll (Cross-Partition) ```text -Step 0: Fetch(query to PK range "0") -Step 1: Fetch(query to PK range "1") -Step 2: Fetch(query to PK range "2") -Step 3: UnorderedMerge(inputs: [0, 1, 2]) → output - (or Step 3: OrderedMerge for explicit ORDER BY) +Graph { + nodes: [ + 0: Fetch { epk_range: ["","55"), continuation: None }, + 1: Fetch { epk_range: ["55","AA"), continuation: None }, + 2: Fetch { epk_range: ["AA","FF"), continuation: None }, + 3: Drain { children: NodeRange(0..3) }, + ], + root: NodeId(3), +} ``` -Each turn, the executor advances whichever PK range steps have results available. +The executor processes partitions sequentially: +1. Fetch all pages from EPK range `["","55")` until that partition is drained. +2. Move to EPK range `["55","AA")`, fetch all pages. +3. Move to EPK range `["AA","FF")`, fetch all pages. -**Ordering note:** Within each partition, results are always returned in -(PartitionKey, ID) ascending order — even without an explicit `ORDER BY` clause. -The `UnorderedMerge` step concatenates partition results without cross-partition -sorting. For queries with an explicit `ORDER BY`, an `OrderedMerge` step (future) -performs a k-way merge over partition heads to produce globally ordered results. +Each `execute_operation` call produces exactly **one page** from the currently-active +partition. When a partition is fully drained (server returns no continuation), the next +call starts the next partition. A continuation token is returned after each page until +all partitions are exhausted. -### 4.3 Incremental Page Production +#### ReadAll — Resumed from Continuation -Plans MUST support incremental page production. The executor does NOT wait for all partition -steps to complete before emitting a page. Instead: +When resuming from a continuation token that says "active range is `["55","AA")` with +server token `xyz`", the Planner skips already-drained ranges and rebuilds the plan +starting from the active range: -- **Unordered fan-out** (ReadMany, cross-partition query without ORDER BY): Results are - buffered per partition step. For ReadMany, all partitions are driven to completion and - merged (single logical page). For queries, pages are emitted as partitions produce them. - Note that within each partition, results arrive in (PartitionKey, ID) ascending order; - only the cross-partition merge is unordered. - -- **Ordered fan-out** (cross-partition query with explicit ORDER BY, future): A k-way merge - streams items from partition heads. A page is emitted when enough items are available or - a partition produces a page boundary. 
+```text
+Graph {
+  nodes: [
+    0: Fetch { epk_range: ["55","AA"), continuation: Some("xyz") },
+    1: Fetch { epk_range: ["AA","FF"), continuation: None },
+    2: Drain { children: NodeRange(0..2) },
+  ],
+  root: NodeId(2),
+}
+```

-- **Single-step plans**: Each turn is one HTTP request, one page.
+Only the remaining partitions are in the plan. The first Fetch carries the server
+continuation from the token.

-### 4.4 Trivial Plan Optimization
+### 4.4 SingleNode Optimization

 For point operations, the plan model MUST be zero or near-zero overhead compared to the current
-direct `execute_single_operation` call. The `OperationPlan::Trivial` variant ensures this:
+direct `execute_single_operation` call. The `OperationPlan::SingleNode` variant ensures this:

-- **No heap allocation**: The single `PlanStep` is stored inline in the enum, not in a `Vec`.
-- **No graph traversal**: The executor matches on `Trivial` and directly calls
+- **No heap allocation**: The single `PlanNode` is stored inline in the enum, not in a `Vec`.
+- **No graph traversal**: The executor matches on `SingleNode` and directly calls
   `execute_single_operation`.

 ---

@@ -585,9 +576,8 @@ direct `execute_single_operation` call. The `OperationPlan::SingleNode` variant ens

 ### 5.1 Responsibilities

-The Planner transforms a `CosmosOperation` into an `OperationPlan`. It is a synchronous,
-deterministic function for most operations, but MAY need to perform async I/O for cross-partition
-queries (fetching a backend query plan).
+The Planner transforms a `CosmosOperation` into an `OperationPlan`. For ReadAll, this is
+synchronous: resolve partition key ranges and build a `Drain` node over `Fetch` children.

 ```rust
 pub(crate) struct Planner<'a> {
@@ -599,8 +589,7 @@ impl<'a> Planner<'a> {
     /// Creates an operation plan from a CosmosOperation.
     ///
     /// For point operations, this is synchronous and trivial.
-    /// For feed operations, this may need to resolve PK ranges
-    /// and (for cross-partition queries) fetch a backend query plan.
+    /// For ReadAll, this resolves PK ranges and builds a Drain plan.
     pub async fn plan(
         &self,
         operation: &CosmosOperation,
@@ -608,8 +597,6 @@ impl<'a> Planner<'a> {
         continuation: Option<&ContinuationToken>,
         // Callback for fetching PK ranges (keeps Planner transport-decoupled).
         fetch_pk_ranges: impl Fn(...) -> ...,
-        // Callback for fetching query plans (keeps Planner transport-decoupled).
-        fetch_query_plan: impl Fn(...) -> ...,
     ) -> azure_core::Result<OperationPlan> {
         // ...
     }
 }
 ```

@@ -620,176 +607,119 @@ impl<'a> Planner<'a> {

 | Operation | Targeting | Plan Strategy |
 |-----------|-----------|---------------|
-| ReadItem, DeleteItem, etc. | `PartitionKey` | Single `Fetch` step. Trivial. |
-| CreateDatabase, ReadContainer, etc. | `None` | Single `Fetch` step. Trivial. |
-| QueryItems (single partition) | `PartitionKey` | Single `Fetch` step. Paginated. |
-| ReadAllItems (single partition) | `PartitionKey` | Single `Fetch` step. Paginated. |
-| QueryItems (cross-partition) | `EpkRange` (`all_ranges()`) | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. May fetch query plan. |
-| ReadMany | `EpkRange` (`all_ranges()`) | Group items by PK range → N `Fetch` steps + `UnorderedMerge`. No pagination. |
-| ReadAllItems (cross-partition) | `EpkRange` (`all_ranges()`) | Resolve PK ranges → N `Fetch` steps + `UnorderedMerge`. Paginated. |
-| ChangeFeed (future) | varies | TBD |
+| ReadItem, DeleteItem, etc. | `PartitionKey` | Single `Fetch` node. SingleNode. |
+| CreateDatabase, ReadContainer, etc. | `None` | Single `Fetch` node. SingleNode. |
+| ReadAllItems (single partition) | `PartitionKey` | Single `Fetch` node. Paginated. |
+| ReadAllItems (cross-partition) | `EpkRange` (`all_ranges()`) | Resolve PK ranges → `Drain` over N `Fetch` nodes. Sequential. |

-### 5.3 Operation Decomposition: From One `CosmosOperation` to Many
+### 5.3 Pseudo-Code: Building a Trivial Plan

-A key responsibility of the Planner is decomposing a single caller-provided `CosmosOperation`
-into multiple targeted `CosmosOperation` instances — one per partition key range — that each
-flow through `execute_single_operation` independently. This section illustrates the full
-decomposition for two representative operations.
-
-#### Example: Cross-Partition Query
-
-The caller creates a single operation:
+The following pseudo-code illustrates how the Planner constructs a plan for a point
+operation or single-partition feed:

 ```rust
-let op = CosmosOperation::query_items_cross_partition(
-    container.clone(),
-    "SELECT * FROM c WHERE c.status = 'active'",
-);
-// op.target == OperationTarget::all_ranges() (full EPK range ["", "FF"))
-// op.payload == OperationPayload::Query { query: "SELECT ...", parameters: None }
-```
-
-The Planner first fetches a **backend query plan** from the service (see [§5.4](#54-query-plan-fetching))
-to determine how the query should be distributed across partitions — including whether
-client-side sort or aggregation is required. It then resolves the container's partition key
-ranges (say, ranges "0", "1", "2") and uses the backend query plan to assemble an
-`OperationPlan` with **three separate `CosmosOperation`** instances:
-
-```text
-Caller's CosmosOperation
-    target: EpkRange ["", "FF")  (all_ranges())
-    payload: Query { "SELECT * FROM c WHERE c.status = 'active'" }
-        │
-        ▼
-  ┌─── Planner ──────────────────────────────────┐
-  │ 1. Fetch backend query plan (via §5.4)       │
-  │ 2. Resolve PK ranges: 0, 1, 2                │
-  │ 3. Assemble plan from query plan + PK ranges │
-  └───────┬──────────────────────────────────────┘
-  ┌───────────┼───────────┐
-  ▼           ▼           ▼
- CosmosOperation  CosmosOperation  CosmosOperation
-  type: Query      type: Query      type: Query
-  target:          target:          target:
-   EpkRange         EpkRange         EpkRange
-   ["","55")        ["55","AA")      ["AA","FF")
-  payload:         payload:         payload:
-   Query{rewritten} Query{rewritten} Query{rewritten}
-      │               │               │
-      ▼               ▼               ▼
-  execute_single  execute_single  execute_single
-  _operation()    _operation()    _operation()
-      │               │               │
-      ▼               ▼               ▼
-  CosmosResponse  CosmosResponse  CosmosResponse
-      │               │               │
-      └───────────────┼───────────────┘
-                      ▼
-               UnorderedMerge
-                      │
-                      ▼
-               CosmosResponse
+// PSEUDO-CODE — illustrative, not compilable
+fn plan_trivial(operation: CosmosOperation, options: OperationOptions) -> OperationPlan {
+    OperationPlan::SingleNode(PlanNode::Fetch {
+        epk_range: operation.target().as_epk_range(),
+        operation: Arc::new(operation),
+        options,
+        continuation: None,
+    })
+}
 ```

-Each decomposed `CosmosOperation` is **retargeted** to a specific EPK range and wrapped in
-`Arc` so that fan-out steps share the immutable parts of the operation (headers, resource
-reference, etc.) without cloning. The Planner creates the retargeted operations and wraps
-each in an `Arc`; the executor passes `Arc<CosmosOperation>` to `execute_single_operation`.
-Note that the query payload may differ from the caller's original SQL: the backend query
-plan may **rewrite the query** (e.g., to push down aggregations, add internal projections,
-or restructure filters for per-partition execution), and the Planner uses the rewritten
-query text in the decomposed operations. The operation pipeline handles region failover,
-retry, and auth for each independently.
+No PK range resolution is needed. The operation is wrapped in a single `Fetch` node.

-#### Example: ReadMany
+### 5.4 Pseudo-Code: Building a ReadFeed Plan

-The caller creates one operation with 5 items across 3 PK ranges:
+The following pseudo-code illustrates how the Planner constructs a cross-partition ReadAll
+plan, including resume from a continuation token:

 ```rust
-let op = CosmosOperation::read_many(container.clone(), vec![
-    ("id_a".into(), PartitionKey::from("pk_a")),
-    ("id_b".into(), PartitionKey::from("pk_b")),
-    ("id_c".into(), PartitionKey::from("pk_c")),
-    ("id_d".into(), PartitionKey::from("pk_d")),
-    ("id_e".into(), PartitionKey::from("pk_e")),
-]);
-```
-
-The Planner computes EPKs for each partition key, groups by PK range, and produces:
-
-```text
-Caller's CosmosOperation
-    target: EpkRange ["", "FF")  (all_ranges())
-    payload: ReadMany { items: [(id_a,pk_a), (id_b,pk_b), (id_c,pk_c), (id_d,pk_d), (id_e,pk_e)] }
-        │
-        ▼
-  ┌─── Planner ──────────────────────────────────────────┐
-  │ EPK(pk_a),EPK(pk_b) → PK range "0"                    │
-  │ EPK(pk_c)           → PK range "1"  (single item!)    │
-  │ EPK(pk_d),EPK(pk_e) → PK range "2"                    │
-  └───────┬──────────────────────────────────────────────┘
-  ┌───────────┼───────────┐
-  ▼           ▼           ▼
- CosmosOperation  CosmosOperation  CosmosOperation
-  type: Query      type: Read       type: Query
-  target:          target:          target:
-   EpkRange         EpkRange         EpkRange
-   ["","55")        [EPK(pk_c),      ["AA","FF")
-                     EPK(pk_c))
-  payload:         payload:         payload:
-   Body{query on    None (point      Body{query on
-   (pk_a,id_a),     read of id_c)    (pk_d,id_d),
-   (pk_b,id_b)}                      (pk_e,id_e)}
-```
-
-Note two things:
-1. The ReadMany query for each PK range filters on **both partition key and ID**, because
-   ID alone is not unique — only (PartitionKey, ID) is unique within a container.
-2. PK range "1" contains only a single item, so the Planner **optimizes it to a point read**
-   (`OperationType::Read` with a point EPK range), avoiding query overhead.
-
-Each decomposed operation then flows through `execute_single_operation` independently.
-
-### 5.4 Query Plan Fetching
-
-For cross-partition queries, the Planner may need a backend query plan to determine:
-- Which partitions to target
-- Whether the query requires client-side sort/aggregate
-- Optimized partition routing
+// PSEUDO-CODE — illustrative, not compilable
+fn plan_read_feed(
+    operation: &CosmosOperation,
+    pk_ranges: &[PartitionKeyRange],
+    continuation: Option<&ContinuationToken>,
+) -> OperationPlan {
+    // Determine where to start: either from a continuation token or the beginning.
+    let (start_epk, server_token) = match continuation {
+        Some(token) => {
+            let state = token.resume_state();
+            match state.server_token() {
+                // Mid-partition: resume the active range from its server token.
+                Some(t) => (state.epk_min(), Some(t.clone())),
+                // Active range just completed: skip past it entirely and start
+                // the next range fresh (see §7.3).
+                None => (state.epk_max(), None),
+            }
+        }
+        None => (EffectivePartitionKey::MIN, None),
+    };
+
+    // Build Fetch nodes bottom-up, one per PK range that hasn't been drained.
+    let shared_op = Arc::new(create_fetch_from(operation));
+    let mut nodes = Vec::new();
+
+    let remaining_ranges = pk_ranges
+        .iter()
+        .filter(|r| r.max_epk() > start_epk);
+
+    let mut is_first_remaining = true;
+    for range in remaining_ranges {
+        let continuation = if is_first_remaining {
+            is_first_remaining = false;
+            server_token.clone()
+        } else {
+            None
+        };
+
+        nodes.push(PlanNode::Fetch {
+            operation: Arc::clone(&shared_op),
+            options: derive_fetch_options(range),
+            epk_range: range.epk_range(),
+            continuation,
+        });
+    }

-The Planner uses a **callback** to fetch the query plan, keeping it transport-decoupled. The
-callback internally calls `execute_single_operation` (not `execute_operation`), avoiding
-re-entry into the Planner. The `OperationType::QueryPlan` variant already exists for this.
+    // Push the Drain node after all its children (bottom-up invariant).
+    let children = NodeRange {
+        start: NodeId(0),
+        end: NodeId(nodes.len() as u32),
+    };
+    nodes.push(PlanNode::Drain { children });

-```rust
-// The Planner calls this callback, which the driver wires to
-// execute_single_operation directly (bypassing the Planner).
-async fn fetch_query_plan(
-    operation: &CosmosOperation,
-    options: &OperationOptions,
-) -> azure_core::Result<BackendQueryPlan> {
-    let query_plan_op = CosmosOperation::query_plan(
-        operation.container().unwrap().clone(),
-        /* query text from operation payload */
-    );
-    let response = execute_single_operation(query_plan_op, options, ...).await?;
-    BackendQueryPlan::from_response(response)
+    let root = NodeId(nodes.len() as u32 - 1);
+    OperationPlan::Graph { nodes, root }
 }
 ```

-This avoids the recursion concern: `fetch_query_plan` calls `execute_single_operation`
-directly, which is the internal pipeline function, not the public `execute_operation` that
-goes through the Planner.
+Key points:
+- Fetch nodes are pushed first (children), then the Drain (parent) — maintaining the
+  bottom-up invariant.
+- On resume, ranges the continuation marks as drained are skipped entirely. If the token
+  is mid-partition, the first remaining Fetch carries its server token; a just-completed
+  range is skipped as well.
+- All Fetch nodes share the base operation via `Arc`, avoiding clones of headers and
+  resource references.

 ### 5.5 Resuming from a Continuation Token

 When a `ContinuationToken` is provided, the Planner validates it (version, container RID,
-operation compatibility), resolves the current partition key ranges, and walks the nested
-`ResumeState` tree to reconstruct the plan with the correct per-step state.
+operation kind), resolves the current partition key ranges, and uses the token's resume
+state to reconstruct the plan at the correct position.

-The full resume algorithm — including left/target/right partition classification, filter
-generation for ORDER BY, and partition split handling — is described in
-[§7.3 Resume Strategy](#73-resume-strategy).
+The resume algorithm for `Drain` is described in [§7.3 Resume Strategy](#73-resume-strategy).
+
+### 5.6 Future Extensions
+
+The Planner architecture supports future operations without redesign:
+
+- **ReadMany**: Group items by PK range, create concurrent `Fetch` nodes with an
+  `UnorderedMerge` parent. Requires adding concurrency support to the PlanExecutor.
+- **Cross-partition query**: Fetch a backend query plan, create `Fetch` nodes per
+  partition, optionally with `OrderedMerge` for ORDER BY queries.
+- **Change feed**: Create `Fetch` nodes scoped to feed ranges with change-feed-specific
+  continuation state. Add a parent merge node based on change-feed merge semantics.
+- **Concurrency management**: All plan nodes receive a **concurrency permit** (semaphore
+  token) during execution. For ReadAll, the executor holds a single permit — sequential
+  by design. Future operations (ReadMany, cross-partition queries) will acquire multiple
+  permits from a shared semaphore, allowing the PlanExecutor to control the degree of
+  parallelism across nodes without changing the plan model.

 ---

@@ -797,119 +727,126 @@ generation for ORDER BY, and partition split handling — is described in

 ### 6.1 Core Execution Loop

-The Plan Executor runs an `OperationPlan` and produces pages of results.
+The Plan Executor runs an `OperationPlan` and produces one page of results per call.

 ```rust
-pub(crate) struct PlanExecutor {
-    plan: OperationPlan,
-    /// Per-step state (continuation, completion status).
-    step_states: Vec<StepState>,
-    /// Concurrency control for fan-out.
-    concurrency_limit: usize,
-    /// Diagnostics builder for collecting step-level timing.
-    diagnostics: DiagnosticsContextBuilder,
-}
+pub(crate) struct PlanExecutor;

 impl PlanExecutor {
-    /// Executes one turn of the plan, producing a `CosmosResponse`.
+    /// Executes one page of the plan, producing a `CosmosResponse`.
     ///
     /// The response includes a continuation token if more pages are available.
-    /// For non-paginating plans (ReadMany), this drives all steps to completion
-    /// and returns the merged result with no continuation token.
+    /// Each call executes exactly one HTTP request to one partition.
     pub async fn execute(
-        &mut self,
+        plan: &OperationPlan,
         driver_context: &DriverContext,
+        diagnostics: &mut DiagnosticsContextBuilder,
     ) -> azure_core::Result<CosmosResponse> {
         // ...
     }
 }
 ```

-### 6.2 Turn Execution
-
-Each call to `execute`:
-
-1. **Record step enqueue** — mark each step as enqueued for concurrency tracking.
-2. **Identify runnable steps** — steps whose dependencies are satisfied.
-3. **Execute runnable steps concurrently** (up to concurrency cap), each via
-   `execute_single_operation`.
-4. **Collect results** from completed steps.
-5. **Advance continuation state** for steps that returned server continuations.
-6. **Execute dependent steps** (e.g., `UnorderedMerge`) when their inputs are ready.
-7. **Produce the page** from the output step's result.
-8. **Update step states** for the next turn.
-
-### 6.3 Concurrency Control
-
-Fan-out steps are executed with a configurable concurrency cap:
+The following pseudo-code illustrates the core execution loop for a `Drain` plan.
+Function names are descriptive; their implementations are not shown.

 ```rust
-/// Maximum number of concurrent partition key range fetches.
-///
-/// Defaults to `min(num_pk_ranges, 10)`. Configurable via
-/// `OperationOptions::max_concurrency`.
-concurrency_limit: usize,
+// PSEUDO-CODE — illustrative, not compilable
+async fn execute_plan(
+    plan: &OperationPlan,
+    driver_context: &DriverContext,
+    diagnostics: &mut DiagnosticsContextBuilder,
+) -> Result<CosmosResponse> {
+    match plan {
+        OperationPlan::SingleNode(fetch) => {
+            // Point ops and single-partition feeds: execute directly.
+            execute_fetch_node(fetch, driver_context, diagnostics).await
+        }
+        OperationPlan::Graph { nodes, root } => {
+            let root_node = &nodes[root.0 as usize];
+            execute_node(root_node, nodes, driver_context, diagnostics).await
+        }
+    }
+}
+
+async fn execute_node(
+    node: &PlanNode,
+    all_nodes: &[PlanNode],
+    driver_context: &DriverContext,
+    diagnostics: &mut DiagnosticsContextBuilder,
+) -> Result<CosmosResponse> {
+    match node {
+        PlanNode::Fetch { .. } => {
+            execute_fetch_node(node, driver_context, diagnostics).await
+        }
+        PlanNode::Drain { children } => {
+            // Find the active child: the first Fetch that hasn't been drained.
+            // On a fresh plan, this is children.start. On resume, the Planner
+            // has already pruned drained partitions, so children.start is the
+            // active one.
+            let active_id = children.start;
+            let active_fetch = &all_nodes[active_id.0 as usize];
+
+            // Acquire a concurrency permit (sequential: only one permit).
+            let _permit = acquire_concurrency_permit(driver_context).await;
+
+            // Execute one page from the active partition.
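+            // (Assumption in this sketch: execute_fetch_node also absorbs
+            // 410/1002 partition splits internally, as described in §9.1, so
+            // the Drain node never restructures the plan mid-page.)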
+            let response = execute_fetch_node(
+                active_fetch, driver_context, diagnostics
+            ).await?;
+
+            // Build the continuation token based on what happened.
+            let continuation = build_drain_continuation(
+                &response, active_fetch, active_id, children, all_nodes
+            );
+
+            Ok(response.with_continuation(continuation))
+        }
+    }
+}
 ```

-The executor uses a semaphore or similar mechanism to limit concurrent
-`execute_single_operation` calls. Each concurrent call independently goes through the
-full operation pipeline (region failover, retry, etc.).
-
-### 6.4 ReadMany Execution Details
-
-ReadMany is the initial target. Its execution:
-
-1. **Planner** groups `(id, partition_key)` pairs by PK range (via `PartitionKeyRangeCache`).
-2. **Plan** has N `Fetch` steps (one per PK range) + one `UnorderedMerge` step.
-3. **Executor** runs all `Fetch` steps concurrently (up to concurrency limit).
-4. Each `Fetch` step sends a query to its PK range. The query body encodes **both the item IDs
-   and the partition keys** for that range, because ID alone is not unique — only the
-   (PartitionKey, ID) pair is unique within a container. If the response includes a server
-   continuation, the executor continues fetching that range until all items are retrieved.
-5. **UnorderedMerge** step concatenates results from all ranges.
-6. Returns a single `CosmosResponse` containing all items (with no continuation token).
-7. Subsequent calls with the same operation (no continuation) would re-execute from scratch.
-
-**Optimization:** When a PK range contains only a single item, the Planner optimizes the
-`Fetch` step to a point read instead of a query (see §4.2).
-
-**Semantics:**
-- **Missing items**: Items not found are silently omitted from the result. The response does
-  not indicate which items were not found.
-- **Order**: Output order is NOT guaranteed to match input order. Items are grouped by
-  partition key range.
-- **Partial failure**: If any PK range fetch fails after exhausting retries, the entire
-  ReadMany operation fails. Partial results are not returned.
-
-### 6.5 Backpressure & Cancellation
-
-- **Caller drops the future**: In-flight `execute_single_operation` futures are
-  cancelled via standard Rust drop semantics. The executor does not buffer results beyond
-  what is needed for the current turn.
-- **Memory bounds**: The executor does not buffer more than `concurrency_limit` concurrent
-  page results. For ReadMany (which buffers all results), the total buffered data is bounded
-  by the total size of all items — the caller controls this by the size of the input list.
-- **Cancellation mid-turn**: If the caller cancels (drops the future) during a turn, any
-  in-flight HTTP requests are dropped. The continuation token from the *previous* completed
-  turn remains valid for resumption.
+### 6.2 Backpressure & Cancellation
+
+- **Caller drops the future**: The in-flight `execute_single_operation` future is
+  cancelled via standard Rust drop semantics.
+- **Memory bounds**: Each call buffers at most one page of results.
+- **Cancellation mid-page**: If the caller cancels during a page fetch, the continuation
+  token from the *previous* completed call remains valid for resumption.

 ---

 ## 7. Continuation Tokens

-### 7.1 Design Principle: O(1) Token Size
+### 7.1 Design Principles

-A container may have many physical partitions. Storing per-range continuation state
-for every partition would make the token size linear in partition count — unacceptable for
-tokens that must cross HTTP request boundaries (e.g., sent to a browser in a URL or header).
+Continuation tokens must be:

-Instead, the continuation token stores the state of **exactly one partition key range** — the
-range where execution last yielded results. On resume, the Planner reconstructs the positions
-of all other partitions using **query filter rewriting** rather than stored server tokens.
+1. **Durable across SDK versions** — A token produced by SDK version N must be usable by
+   SDK version N+k. Tokens may be stored durably (e.g., in a database) or transiently
+   (e.g., in a URL parameter) and must survive SDK upgrades. Newer SDKs MUST support reading
+   tokens from older SDKs. Changing the token format dramatically increases complexity because
+   SDKs must support versions `current - x`.

-This follows the same pattern as the Java Cosmos SDK, which exploits the fact that Cosmos DB
-data has a composite sort order `(query_sort_order, partition_key_range_id)` to generate
-efficient range filters for partitions that don't have stored continuation tokens.
+2. **Versioned** — Tokens carry a version field. Revving the version is the option of last
+   resort. New `ResumeState` variants can be added without changing the version, because
+   `serde`'s tagged enum deserialization rejects unknown variants cleanly (they fail to
+   parse, which is the correct behavior when an older SDK encounters a token from a newer one).
+
+3. **Aim for O(1) size** — Token size should ideally be constant regardless of partition
+   count. For ReadAll, only the state of the currently-active partition is stored, and other
+   partitions' positions are reconstructed from EPK bounds on resume. However, per-partition
+   state MAY become necessary for certain node types (e.g., change feed requires per-range
+   tokens). It is up to each node type to define its own resume state and thus determine
+   the size of that state.
+
+4. **Composable** — Each node type defines its own `ResumeState` variant. New node types
+   add new variants without breaking the token structure for existing node types. The resume
+   state is extensible via serde's tagged enum — unknown variants from newer SDKs simply
+   fail to deserialize in older SDKs, which then reject the token.
+
+5. **Operation-bound** — Tokens include an operation kind to prevent replaying a token from
+   one operation type against a different operation on the same container.

 ### 7.2 Token Structure

@@ -919,16 +856,6 @@ efficient range filters for partitions that don't have stored continuation token
 /// Opaque to callers. Serializes to a string via `Display` and
 /// deserializes via `FromStr`. The internal representation is
 /// versioned and validated on deserialization.
-///
-/// The token mirrors the plan's step graph as a **nested** structure:
-/// each pipeline stage wraps the continuation state of its children.
-/// This means each layer can interpret its children's state in context
-/// — for example, an `OrderBy` node knows how to generate filters for
-/// the `Fetch` nodes it wraps, without the Fetch nodes needing to be
-/// aware of ORDER BY semantics.
-///
-/// On resume, the Planner walks the nested token top-down, matching
-/// each layer to the corresponding step in the re-created plan.
#[derive(Clone, Debug)]
 pub struct ContinuationToken {
     inner: ContinuationTokenInner,
@@ -936,189 +863,135 @@ pub struct ContinuationToken {

 /// Internal token representation (not public).
 #[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
 struct ContinuationTokenInner {
     /// Token format version for forward/backward compatibility.
     version: u32,

     /// Container identity (RID, not name) to detect container recreation.
-    #[serde(rename = "containerRid")]
     container_rid: String,

-    /// The nested resume state, rooted at the plan's output step.
-    /// Each layer wraps the state of its child steps.
+    /// The operation kind this token was produced for.
+    /// Prevents replaying tokens across incompatible operations.
+    operation_kind: String,
+
+    /// The resume state, defined by the node type that produced it.
     resume: ResumeState,
 }
+```

+```rust
-/// Nested resume state for a plan step.
+/// Resume state for a plan node.
 ///
-/// Each variant captures the state for one pipeline stage and embeds
-/// its children's state. This forms a tree that mirrors the plan DAG.
-/// New variants can be added as new pipeline stages are introduced.
+/// Each variant captures the state for one node type. New variants
+/// can be added as new node types are introduced, without changing
+/// the token version.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(tag = "type")]
 enum ResumeState {
+    /// Sequential cross-partition drain.
+    ///
+    /// Tracks the current feed range position: `epk_min` and `epk_max`
+    /// identify the active range, and `server_token` holds the server
+    /// continuation for that range (if mid-partition).
+    ///
+    /// On resume, ranges with max ≤ `epk_min` are skipped (already drained).
+    /// The range matching `[epk_min, epk_max)` resumes from `server_token`.
+    /// Ranges after `epk_max` start fresh.
+    #[serde(rename = "drain")]
+    Drain(DrainState),
+
     /// A single partition fetch, mid-stream or just completed.
-    /// This is a leaf node — it has no children.
+    /// Used as the root resume state for single-partition feed operations.
     #[serde(rename = "fetch")]
-    Fetch {
-        /// EPK min inclusive of the target range.
-        min: String,
+    Fetch(FetchState),

-        /// EPK max exclusive of the target range.
-        max: String,
+    // Future variants (added without changing token version):
+    //
+    // /// Change feed — per-range continuation tokens.
+    // #[serde(rename = "changeFeed")]
+    // ChangeFeed(ChangeFeedState),
+    //
+    // /// Ordered merge for ORDER BY queries.
+    // #[serde(rename = "orderedMerge")]
+    // OrderedMerge(OrderedMergeState),
+}

-        /// Server-provided continuation token for this range.
-        /// Absent when this range was just completed and the cursor
-        /// is at the boundary to the next range.
-        #[serde(rename = "serverToken", skip_serializing_if = "Option::is_none")]
-        server_continuation: Option<String>,
-    },
+/// Resume state for a Drain node.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct DrainState {
+    /// EPK minimum of the current active feed range.
+    /// All ranges with max ≤ this value have been fully drained.
+    epk_min: String,
+
+    /// EPK maximum of the current active feed range.
+    epk_max: String,
+
+    /// Server-provided continuation token for this range.
+    /// `None` when this range was just completed and the cursor
+    /// is at the boundary to the next range.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    server_token: Option<String>,
+}

-    /// An unordered (sequential-drain) merge over partitions.
-    /// Wraps the child `Fetch` that was active when the token was created.
-    /// On resume, partitions left of the child are skipped, the child
-    /// resumes from its state, and partitions to the right start fresh.
-    #[serde(rename = "drain")]
-    Drain {
-        /// The resume state of the active child Fetch step.
-        inner: Box<ResumeState>,
-    },
+/// Resume state for a single-partition Fetch node.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct FetchState {
+    /// EPK min inclusive of the target range.
+    epk_min: String,

-    /// An ordered (k-way merge) over partitions.
-    /// Wraps the child `Fetch` that last produced results, plus the
-    /// ORDER BY values needed to generate filters for all other partitions.
-    #[serde(rename = "orderBy")]
-    OrderBy {
-        /// The ORDER BY values of the last document returned.
-        /// Used to generate range filters for non-target partitions.
-        #[serde(rename = "lastValues")]
-        last_order_by_values: Vec<serde_json::Value>,
-
-        /// The `_rid` of the last document returned.
-        /// Used for duplicate elimination on the target partition.
-        #[serde(rename = "lastRid")]
-        last_rid: String,
-
-        /// Whether to include documents matching the last ORDER BY values.
-        inclusive: bool,
-
-        /// The resume state of the target child Fetch step.
-        inner: Box<ResumeState>,
-    },
+    /// EPK max exclusive of the target range.
+    epk_max: String,

-    // Future variants:
-    //
-    // /// An offset/limit stage wrapping an inner pipeline.
-    // #[serde(rename = "offsetLimit")]
-    // OffsetLimit {
-    //     skipped: u64,
-    //     returned: u64,
-    //     inner: Box<ResumeState>,
-    // },
+    /// Server-provided continuation token for this range.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    server_token: Option<String>,
 }
 ```

-The nesting means each layer owns the interpretation of its children. An `OrderBy` node
-knows the `Fetch` inside it is the target partition, and uses `lastValues`/`lastRid` to
-generate filters for the other partitions. A `Drain` node knows the `Fetch` inside it is
-the cursor position, and partitions left/right of it are skipped/fresh. Neither the `Fetch`
-node nor the Planner need to cross-reference sibling state.
- #### Wire-format field reference | Rust type | Field | Wire key | Content | |-----------|-------|----------|---------| | `ContinuationTokenInner` | `version` | `version` | Format version (integer) | | | `container_rid` | `containerRid` | Container RID (string) | -| | `resume` | `resume` | Nested `ResumeState` (root of tree) | -| `ResumeState::Fetch` | *(tag)* | `type` | `"fetch"` | -| | `min` | `min` | EPK min inclusive (hex string) | -| | `max` | `max` | EPK max exclusive (hex string) | -| | `server_continuation` | `serverToken` | Server continuation (omitted if null) | -| `ResumeState::Drain` | *(tag)* | `type` | `"drain"` | -| | `inner` | `inner` | Child `ResumeState` | -| `ResumeState::OrderBy` | *(tag)* | `type` | `"orderBy"` | -| | `last_order_by_values` | `lastValues` | Last ORDER BY values (array) | -| | `last_rid` | `lastRid` | Last document `_rid` (string) | -| | `inclusive` | `inclusive` | Include matching values (bool) | -| | `inner` | `inner` | Child `ResumeState` | +| | `operation_kind` | `operationKind` | Operation kind (e.g., `"readAll"`) | +| | `resume` | `resume` | `ResumeState` (tagged union) | +| `DrainState` | *(tag)* | `type` | `"drain"` | +| | `epk_min` | `epkMin` | EPK min inclusive (hex string) | +| | `epk_max` | `epkMax` | EPK max exclusive (hex string) | +| | `server_token` | `serverToken` | Server continuation (omitted if null) | +| `FetchState` | *(tag)* | `type` | `"fetch"` | +| | `epk_min` | `epkMin` | EPK min inclusive (hex string) | +| | `epk_max` | `epkMax` | EPK max exclusive (hex string) | +| | `server_token` | `serverToken` | Server continuation (omitted if null) | ### 7.3 Resume Strategy -On resume, the Planner walks the nested `ResumeState` tree top-down, matching each layer to -the corresponding step in the re-created plan. Each layer interprets its own state and its -child's state in context: +On resume, the Planner validates the token and uses the resume state to reconstruct the +plan at the correct position. -#### `Drain` (unordered cross-partition) +#### `Drain` (sequential cross-partition) -The `Drain` node wraps a `Fetch` child representing the cursor position. On resume: +The `DrainState` tracks the cursor position via EPK bounds. On resume: | Partition position | Action | |--------------------|--------| -| **Left of child** (EPK max ≤ child's min) | Skip — already drained. | -| **Child range** (matches child's EPK bounds) | Resume using child's `serverToken`. | -| **Right of child** (EPK min ≥ child's max) | Start fresh (not yet visited). | - -If the child's range has split, `PartitionMapper` uses the EPK bounds to assign the server -continuation to the appropriate child range(s). +| **Left of active** (range max ≤ `epk_min`) | Skip — already drained. | +| **Active range** (matches `[epk_min, epk_max)`) | Resume using `server_token`. If `server_token` is `None`, the range is complete — skip it and start the next range fresh. | +| **Right of active** (range min ≥ `epk_max`) | Start fresh (not yet visited). | -#### `OrderBy` (ordered cross-partition) - -The `OrderBy` node wraps a `Fetch` child (the target partition) and carries `lastValues` / -`lastRid` for filter generation. On resume: - -| Partition position | Generated filter | Rationale | -|--------------------|-----------------|-----------| -| **Left of child** | ORDER BY values **strictly past** `lastValues` | May have remaining items, but only those after the resume point. 
| -| **Child range** | Server continuation + ORDER BY values **at or past** `lastValues` | Resume exactly where we stopped. | -| **Right of child** | ORDER BY values **at or past** `lastValues` | Haven't fully explored these yet. | - -Duplicate elimination: on the child partition, documents with the same ORDER BY values as -`lastValues` but `_rid ≤ lastRid` have already been returned and are filtered out. +If the active range has split since the token was created, the Planner uses the EPK bounds +to assign the server continuation to the appropriate child range. The `server_token` applies +to the first sub-range that overlaps the original EPK bounds; subsequent sub-ranges start +fresh. #### `Fetch` (leaf — single partition) -A bare `Fetch` at the root (no wrapping `Drain` or `OrderBy`) represents a single-partition -operation. Resume uses `serverToken` directly. - -#### Nesting composes naturally - -Future pipeline stages wrap their children the same way: - -```text -OffsetLimit { skipped: 50, returned: 20, - inner: OrderBy { lastValues: ["Baker"], lastRid: "abc", inclusive: true, - inner: Fetch { min: "55", max: "AA", serverToken: "..." } - } -} -``` - -Each layer reads only its own fields plus `inner`. No layer needs to inspect sibling or -grandchild state. - -#### Mapping `ResumeState` back to `PlanStep` - -The `ResumeState` tree does not map 1:1 to `PlanStep` variants — it maps to the **Planner's -reconstruction logic**: - -| `ResumeState` | Effect on plan | -|---------------|----------------| -| `Fetch` | Sets `PlanStep::Fetch.continuation` to the stored `serverToken`. The EPK bounds identify which `Fetch` step in the plan to target. | -| `Drain` | The Planner uses the child `Fetch`'s EPK bounds to determine which partition was active, skips partitions left of it, and starts right partitions fresh. The `UnorderedMerge` step itself is stateless. | -| `OrderBy` | The Planner generates a `resume_filter` string from `lastValues` and sets it on each `Fetch` step. The child `Fetch`'s `continuation` is also restored. Duplicate elimination state (`lastRid`, `inclusive`) is applied at the executor level. | - -**Filter injection for ORDER BY queries:** The backend query plan provides a rewritten query -containing the `{documentdb-formattableorderbyquery-filter}` placeholder token. The `Fetch` -step's `operation` holds this **unrewritten** query text. At execution time, the executor -replaces the placeholder with the `resume_filter` via simple string substitution. This means: - -- On **first page** (no continuation): the placeholder is replaced with `"true"` (no filter). -- On **resume**: the Planner computes the filter expression from the `OrderBy` resume state - (e.g., `"c.name > 'Baker'"` for left-of-target partitions, `"c.name >= 'Baker'"` for the - target and right-of-target) and sets it as `resume_filter` on each `Fetch` step. - -This approach keeps the `Fetch` step generic — it doesn't need to understand ORDER BY -semantics, just string substitution on a known placeholder. +A bare `FetchState` at the root (no wrapping `Drain`) represents a single-partition operation. +Resume uses `server_token` directly. 
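+
+The classification above can be written directly against the `DrainState` fields. The
+following is a minimal sketch in the spirit of the other pseudo-code in this spec;
+`RangeAction` and the accessors on `PartitionKeyRange` are illustrative names, not part
+of the proposed API:
+
+```rust
+// PSEUDO-CODE — illustrative, not compilable
+enum RangeAction {
+    Skip,                   // already drained
+    Resume(Option<String>), // the active range; carries the server token
+    StartFresh,             // not yet visited
+}
+
+fn classify_range(range: &PartitionKeyRange, state: &DrainState) -> RangeAction {
+    if range.max_epk() <= state.epk_min {
+        // Left of active: fully drained on a previous page.
+        RangeAction::Skip
+    } else if range.min_epk() < state.epk_max {
+        // Overlaps the active range. With no server token the range just
+        // completed, so it is skipped and the next range starts fresh.
+        // (After a split, only the first overlapping sub-range receives
+        // the token, per the split note above.)
+        match &state.server_token {
+            Some(token) => RangeAction::Resume(Some(token.clone())),
+            None => RangeAction::Skip,
+        }
+    } else {
+        // Right of active: not yet visited.
+        RangeAction::StartFresh
+    }
+}
+```
+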
### 7.4 Serialization

@@ -1157,43 +1030,37 @@ impl FromStr for ContinuationToken {

 #### Sample Tokens

-**Unordered cross-partition query, mid-stream on partition ["55","AA")**
-
-A `Drain` wraps the active `Fetch`:
+**ReadAll, mid-stream on partition ["55","AA")**

 JSON (before base64 encoding):

 ```json
 {
-  "version": 2,
+  "version": 1,
   "containerRid": "dbs/abc/colls/def",
+  "operationKind": "readAll",
   "resume": {
     "type": "drain",
-    "inner": {
-      "type": "fetch",
-      "min": "55",
-      "max": "AA",
-      "serverToken": "+RID:~abc123#RT:1#TRC:10#ISV:2#IEO:65551"
-    }
+    "epkMin": "55",
+    "epkMax": "AA",
+    "serverToken": "+RID:~abc123#RT:1#TRC:10#ISV:2#IEO:65551"
   }
 }
 ```

-On resume, the `Drain` sees its child targets `["55","AA")`. Partitions left of `"55"` are
-skipped, the target resumes from `serverToken`, and partitions right of `"AA"` start fresh.
+On resume, the Planner sees the drain cursor at `["55","AA")`. Ranges with max ≤ `"55"` are
+skipped. The range `["55","AA")` resumes from `serverToken`. Ranges after `"AA"` start fresh.

-**Unordered query, target partition just completed (cursor at boundary)**
+**ReadAll, target partition just completed (cursor at boundary)**

 ```json
 {
-  "version": 2,
+  "version": 1,
   "containerRid": "dbs/abc/colls/def",
+  "operationKind": "readAll",
   "resume": {
     "type": "drain",
-    "inner": {
-      "type": "fetch",
-      "min": "55",
-      "max": "AA"
-    }
+    "epkMin": "55",
+    "epkMax": "AA"
   }
 }
 ```

 `serverToken` is absent, meaning partition `["55","AA")` is fully drained. The Planner
 skips everything up to and including this range, and starts the next partition fresh.

-**Single-partition query, mid-stream**
+**Single-partition feed, mid-stream**

-A bare `Fetch` at the root (no wrapping layer):
+A bare `FetchState` at the root (no wrapping layer):

 ```json
 {
-  "version": 2,
+  "version": 1,
   "containerRid": "dbs/abc/colls/def",
+  "operationKind": "readAll",
   "resume": {
     "type": "fetch",
-    "min": "55",
-    "max": "AA",
+    "epkMin": "55",
+    "epkMax": "AA",
     "serverToken": "-RID:QmFzZTY0#RT:3#TRC:50"
   }
 }
 ```

-**ORDER BY cross-partition query, `ORDER BY c.name ASC`**
-
-An `OrderBy` wraps the target `Fetch`, carrying the last returned document's sort values:
-
-```json
-{
-  "version": 2,
-  "containerRid": "dbs/abc/colls/def",
-  "resume": {
-    "type": "orderBy",
-    "lastValues": ["Baker"],
-    "lastRid": "R3JlYXQ",
-    "inclusive": true,
-    "inner": {
-      "type": "fetch",
-      "min": "55",
-      "max": "AA",
-      "serverToken": "+RID:~abc456#RT:2#TRC:5#ISV:2#IEO:65551"
-    }
-  }
-}
-```
-
-On resume, the `OrderBy` layer generates partition filters from `lastValues`:
-- Partitions left of `"55"`: filter `c.name > 'Baker'` (strictly past).
-- Target `["55","AA")`: resume from `serverToken`, filter `c.name >= 'Baker'`,
-  deduplicate items with `_rid ≤ "R3JlYXQ"`.
-- Partitions right of `"AA"`: filter `c.name >= 'Baker'`.
-
-**Compound ORDER BY, `ORDER BY c.name ASC, c.age DESC`**
-
-```json
-{
-  "version": 2,
-  "containerRid": "dbs/abc/colls/def",
-  "resume": {
-    "type": "orderBy",
-    "lastValues": ["Baker", 42],
-    "lastRid": "UmVzdW1l",
-    "inclusive": true,
-    "inner": {
-      "type": "fetch",
-      "min": "AA",
-      "max": "FF",
-      "serverToken": "+RID:~abc789#RT:1#TRC:3#ISV:2"
-    }
-  }
-}
-```
-
-The `lastValues` array contains one entry per ORDER BY column, in declaration order.
-
 ### 7.5 Compatibility Contract

 A continuation token is **invalidated** by:

 1. **Container recreation** — The token's `containerRid` won't match the new container's RID.
 2. **Token version mismatch** — A token produced by a newer SDK version may not be readable
-   by an older version.
-3. **Structure mismatch** — If the re-created plan produces a different step graph shape
-   than the token's nested `ResumeState` (e.g., the operation changed, or the plan type
-   differs), the token is rejected.
+   by an older version. Newer SDKs MUST support tokens from older versions (backward
+   compatibility).
+3. **Operation kind mismatch** — The token's `operationKind` must match the operation being
+   resumed. A `readAll` token cannot be used with a query operation.
+4. **Structure mismatch** — If the re-created plan produces a different node type than the
+   token's `ResumeState` variant (e.g., a `drain` token for a single-partition operation),
+   the token is rejected.

 A continuation token **survives**:

-1. **Partition splits** — The token stores EPK bounds, not just PK range IDs. On resume, the
+1. **Partition splits** — The token stores EPK bounds, not PK range IDs. On resume, the
    Planner re-resolves EPK bounds to current PK range IDs.
 2. **SDK version upgrades** — The token is versioned. Older token versions are supported
    by newer SDKs (backward compatible deserialization).
 3. **Process boundaries** — The token is a self-contained string, safe to send to a browser
    and back.
+4. **Durable storage** — Tokens can be stored in databases and used across process restarts,
+   machine migrations, and SDK upgrades.

 ### 7.6 What the Token Does NOT Encode

-- **Per-range state for all partitions** — Only the active Fetch step's state is stored.
-  Other partitions' positions are reconstructed via query filter rewriting on resume.
+- **Per-range state for all partitions (for Drain)** — Only the active range's state is
+  stored. Other partitions' positions are reconstructed from the EPK bounds on resume. Other
+  node types may store per-range state if needed (see the change feed notes in §5.6).
- **Query text or parameters** — The caller must provide an equivalent `CosmosOperation`.
 - **Session tokens** — Session consistency is not preserved across process boundaries via
-  the continuation token. The driver resolves session tokens from the `SessionManager` cache
-  for each turn independently.
-- **Container name or database name** — Only the RID is stored. The caller provides routing
-  context via the `CosmosOperation`.
+  the continuation token.
+- **Container name or database name** — Only the RID is stored.
 - **PK range IDs** — Only EPK bounds are stored, which are stable across partition splits.
   PK range IDs are resolved dynamically from the `PartitionKeyRangeCache` on resume.

@@ -1311,259 +1130,257 @@ A continuation token **survives**:

 The driver does **not** create OpenTelemetry spans or any other telemetry artifacts. Instead,
 each call to `execute_operation` returns a `DiagnosticsContext` on the `CosmosResponse`
-containing a structured hierarchy of timing, concurrency, and request data. The higher-level
-SDK crate uses this data to create OTEL spans, log entries, or any other telemetry it chooses.
+containing a structured hierarchy of timing and request data. The higher-level SDK crate uses
+this data to create OTEL spans, log entries, or any other telemetry it chooses.

 This separation ensures the driver remains transport- and telemetry-agnostic while providing
 enough detail for the SDK to reconstruct the full execution timeline.
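+
+As a rough illustration of the intended consumption model, an SDK layer might walk the
+returned node tree to emit spans. This sketch assumes accessor methods on
+`NodeDiagnostics` (`node_type()`, `requests()`, `children()`) and a hypothetical tracer
+API; neither is prescribed by this spec:
+
+```rust
+// PSEUDO-CODE — illustrative, not compilable
+fn emit_spans(node: &NodeDiagnostics, tracer: &Tracer, parent: &SpanContext) {
+    // One span per plan node, timed from the node's timestamps.
+    let span = tracer.start_span(node.node_type(), parent);
+
+    // Leaf nodes: annotate the span with per-request data (status, RU, region).
+    for request in node.requests() {
+        span.add_event_from(request);
+    }
+
+    // Composite nodes: recurse into the children executed during this call.
+    for child in node.children() {
+        emit_spans(child, tracer, &span.context());
+    }
+
+    span.end();
+}
+```
+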
-### 8.2 Hierarchy: Turn → Step → Request +### 8.2 Hierarchy: Plan → Node → Request -Each `execute_operation` call executes one **Turn** of an operation plan. A Turn contains -one or more **Steps** (one per plan step executed), and each Step contains zero or more -**Requests** (the existing `RequestDiagnostics` type, unchanged). +Each `execute_operation` call produces a `DiagnosticsContext` with a hierarchical view of the +operation plan's execution. The hierarchy mirrors the plan graph: composite nodes (Drain) +contain child node diagnostics, and leaf nodes (Fetch) contain HTTP request diagnostics. ```text DiagnosticsContext - └── TurnDiagnostics - ├── duration, total RU, concurrency metadata - │ - ├── StepDiagnostics [0] (e.g., Fetch to PK range "0") - │ ├── enqueued_at, started_at, completed_at - │ ├── step type, EPK range - │ └── RequestDiagnostics [0] (initial attempt) - │ RequestDiagnostics [1] (retry, if any) + ├── activityId, totalDurationMs, totalRequestCharge + │ + └── operationPlan (NodeDiagnostics) + ├── nodeType: "drain" + ├── startedAt, completedAt, durationMs │ - ├── StepDiagnostics [1] (e.g., Fetch to PK range "1") - │ ├── enqueued_at, started_at, completed_at - │ └── RequestDiagnostics [0] - │ - └── StepDiagnostics [2] (e.g., UnorderedMerge) - ├── started_at, completed_at - └── (no requests — local computation only) + └── children[] + └── [0] NodeDiagnostics + ├── nodeType: "fetch" + ├── epkRange: { min, max } + ├── startedAt, completedAt, durationMs + ├── requestCharge + ├── outcome: "success" | "failed" + │ + ├── requests[] + │ ├── [0] RequestDiagnostics (initial attempt) + │ └── [1] RequestDiagnostics (retry, if any) + │ + └── children[] (empty for Fetch) ``` -For point operations, the Turn has exactly one Step with one or more Requests (retries). -The hierarchy is always present but trivially flat. - -### 8.3 `TurnDiagnostics` +Every node holds a list of diagnostics from the child nodes it triggered (`children`), +as well as its own HTTP requests. This makes the diagnostics structure recursive and +directly mirrors the plan graph. -```rust -/// Diagnostics for a single turn (one page) of an operation. -/// -/// Each call to `execute_operation` produces exactly one `TurnDiagnostics`. -/// For paginated feed operations, the SDK aggregates multiple turns' diagnostics -/// across pages. -pub struct TurnDiagnostics { - /// Wall-clock time when this turn started. - /// - /// Provides an anchor for converting `Instant` timestamps (used in - /// `StepDiagnostics`) to `SystemTime` for OTEL spans or other - /// wall-clock-based telemetry. The SDK can compute a step's wall-clock - /// start as `wall_clock_start + (step.started_at - start_instant)`. - wall_clock_start: SystemTime, +For point operations (SingleNode plan), the hierarchy collapses: the `operationPlan` +is a single Fetch node with its requests and no children. The existing flat `requests()` +accessor is preserved for backward compatibility by flattening the tree. - /// Monotonic timestamp when this turn started. - /// - /// Used as the reference point for computing wall-clock times from - /// step-level `Instant` timestamps: for any step `Instant` value `i`, - /// the wall-clock time is `wall_clock_start + (i - start_instant)`. - start_instant: Instant, - - /// Wall-clock duration of the entire turn. - duration: Duration, - - /// Total RU charge across all steps and requests in this turn. - total_request_charge: RequestCharge, - - /// Per-step diagnostics, in execution order. 
-    steps: Vec<StepDiagnostics>,
-
-    /// Concurrency metadata for this turn.
-    concurrency: TurnConcurrency,
-}
-
-/// Concurrency metadata for a turn.
-///
-/// Enables the SDK to observe how steps were parallelized and whether the
-/// concurrency cap was a bottleneck. Wait times and max concurrency can
-/// be computed from the step timestamps by the SDK if needed.
-pub struct TurnConcurrency {
-    /// Total number of steps executed in this turn.
-    steps_executed: usize,
-
-    /// The concurrency cap that was configured for this turn.
-    /// Steps beyond this limit waited for a permit before starting.
-    concurrency_cap: usize,
-}
-```
-
-### 8.4 `StepDiagnostics`
+### 8.3 Hierarchical Diagnostics Types

 ```rust
-/// Diagnostics for a single step within a turn.
-///
-/// Captures three timestamps to distinguish **wait time** (waiting for a
-/// concurrency permit) from **execution time** (actually performing the
-/// step's work). These durations can be trivially computed by the SDK:
-///
-/// ```text
-///   enqueued_at      started_at       completed_at
-///   │── wait time ──│── execution time ──│
-///   (started_at -      (completed_at -
-///    enqueued_at)       started_at)
-/// ```
+/// Diagnostics for a single plan node's execution.
 ///
-/// For steps that don't go through the concurrency semaphore (e.g., Merge),
-/// `enqueued_at == started_at` (zero wait time).
-pub struct StepDiagnostics {
-    /// What kind of step this was.
-    step_type: StepType,
-
-    /// The EPK range targeted by this step (for Fetch steps).
-    /// `None` for non-fetch steps (Merge, etc.).
+/// This type is recursive: composite nodes (Drain) contain child
+/// `NodeDiagnostics` entries, mirroring the plan graph structure.
+pub struct NodeDiagnostics {
+    /// What kind of node this was.
+    node_type: NodeType,
+
+    /// The EPK range targeted by this node (for Fetch nodes).
+    /// `None` for non-fetch nodes.
     epk_range: Option<Range<String>>,

-    /// When the step was enqueued for execution (requested a concurrency permit).
-    enqueued_at: Instant,
-
-    /// When the step started executing (acquired its concurrency permit).
+    /// When the node started executing.
     started_at: Instant,

-    /// When the step completed.
+    /// When the node completed.
     completed_at: Instant,

-    /// Total RU charge for this step.
+    /// Duration in milliseconds.
+    duration_ms: u64,
+
+    /// Total RU charge for this node (including children).
     request_charge: RequestCharge,

-    /// Individual HTTP request diagnostics for this step.
-    /// Empty for non-HTTP steps (e.g., Merge).
-    /// May contain multiple entries due to retries within the step.
+    /// Individual HTTP request diagnostics for this node.
+    /// Empty for non-leaf nodes that don't directly issue HTTP requests.
+    /// May contain multiple entries due to retries within the node.
     requests: Vec<RequestDiagnostics>,

-    /// Outcome of this step's execution.
-    ///
-    /// For Fetch steps, the outcome is typically captured in the
-    /// `RequestDiagnostics`. This field captures outcomes for non-HTTP
-    /// steps (e.g., Merge failures) and provides a summary for all
-    /// step types without requiring callers to inspect nested requests.
-    outcome: StepOutcome,
+    /// Child node diagnostics, for composite nodes (Drain, future merge nodes).
+    /// Empty for leaf nodes (Fetch).
+    /// For Drain, contains only the nodes that were executed in this call
+    /// (typically one Fetch node per page).
+    children: Vec<NodeDiagnostics>,
+
+    /// Outcome of this node's execution.
+    outcome: NodeOutcome,
 }

-/// Outcome of a plan step's execution.
+/// Outcome of a plan node's execution.
#[derive(Clone, Debug)] -pub enum StepOutcome { - /// The step completed successfully. +pub enum NodeOutcome { + /// The node completed successfully. Success, - /// The step failed with an error. - /// The message is a brief summary (not a full stack trace). + /// The node failed with an error. Failed { message: String }, } -/// Identifies the kind of plan step for diagnostics purposes. +/// Identifies the kind of plan node for diagnostics purposes. #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum StepType { - /// A Fetch step that executed an HTTP request via execute_single_operation. +#[serde(rename_all = "camelCase")] +pub enum NodeType { + /// A Fetch node that executed an HTTP request via execute_single_operation. Fetch, - /// An UnorderedMerge step that concatenated results from upstream steps. - UnorderedMerge, - // Future: OrderedMerge, OffsetLimit, etc. + /// A Drain node that sequentially processes partitions. + Drain, + // Future: UnorderedMerge, OrderedMerge, Aggregate, etc. } ``` -### 8.5 Collection Approach +### 8.4 JSON Representation -The `PlanExecutor` records timestamps at key points during execution: +Diagnostics serialize to a hierarchical JSON structure with consistent camelCase property +names: -1. **Step enqueued** (`enqueued_at`): Recorded when the executor submits a step for - execution. For concurrent fan-out, this is when the step requests a permit from the - concurrency semaphore. - -2. **Step started** (`started_at`): Recorded when the step acquires its concurrency permit - and begins executing. For steps that don't use the semaphore (single-step plans, Merge - steps), this equals `enqueued_at`. - -3. **Step completed** (`completed_at`): Recorded when the step finishes (successfully or - with an error). For Fetch steps, this is after `execute_single_operation` returns - (including any retries it performs internally). +```json +{ + "activityId": "e4b2c1d8-...", + "totalDurationMs": 42, + "totalRequestCharge": 5.23, + "requestCount": 1, + "operationPlan": { + "nodeType": "drain", + "startedAt": 0, + "completedAt": 42, + "durationMs": 42, + "requestCharge": 5.23, + "outcome": "success", + "requests": [], + "children": [ + { + "nodeType": "fetch", + "epkRange": { "min": "00", "max": "55" }, + "startedAt": 0, + "completedAt": 15, + "durationMs": 15, + "requestCharge": 5.23, + "outcome": "success", + "requests": [ + { + "executionContext": "initial", + "pipelineType": "dataPlane", + "transportSecurity": "secure", + "transportKind": "gateway", + "transportHttpVersion": "http2", + "region": "westus2", + "endpoint": "https://myaccount.documents.azure.com/", + "status": "200", + "requestCharge": 5.23, + "activityId": "e4b2c1d8-...", + "serverDurationMs": 3.2, + "durationMs": 15, + "events": [ + { "eventType": "transportStart", "durationMs": null }, + { "eventType": "responseHeadersReceived", "durationMs": 12 }, + { "eventType": "transportComplete", "durationMs": 15 } + ], + "timedOut": false, + "requestSent": "sent", + "error": null + } + ], + "children": [] + } + ] + } +} +``` -4. **Derived values**: The SDK can compute wait time (`started_at - enqueued_at`), - execution time (`completed_at - started_at`), max concurrent steps (from overlapping - intervals), and total wait time (sum across steps) from the raw timestamps. The driver - stores only the timestamps to minimize memory. 
+For point operations, the structure is similar but with a single Fetch node and no wrapping +Drain: -All timestamps use `Instant::now()` — cheap and monotonic. No allocations beyond the -`Vec` that is already needed for the diagnostics output. No derived -`Duration` fields are stored — the SDK computes them on demand. +```json +{ + "activityId": "a1b2c3d4-...", + "totalDurationMs": 8, + "totalRequestCharge": 1.0, + "requestCount": 1, + "operationPlan": { + "nodeType": "fetch", + "epkRange": null, + "durationMs": 8, + "requestCharge": 1.0, + "outcome": "success", + "requests": [{ "..." : "..." }], + "children": [] + } +} +``` -### 8.6 `DiagnosticsContext` Changes +### 8.5 Alignment with Existing `DiagnosticsContext` -The existing `DiagnosticsContext` gains a `TurnDiagnostics` field. The flat -`requests: Arc>` is replaced by the nested structure, but a -backward-compatible `requests()` accessor is preserved by flattening the tree: +The existing `DiagnosticsContext` type (in `diagnostics_context.rs`) currently uses a flat +`requests: Arc>` structure. The feed operations change adds the +hierarchical `operationPlan` field while preserving backward compatibility: ```rust impl DiagnosticsContext { - /// Returns the turn diagnostics for this operation. - pub fn turn(&self) -> &TurnDiagnostics { ... } + /// Returns the plan diagnostics for this operation. + pub fn operation_plan(&self) -> &NodeDiagnostics { ... } - /// Returns all HTTP request diagnostics, flattened across steps. + /// Returns all HTTP request diagnostics, flattened across nodes. /// /// This is backward-compatible with the pre-feed-operations API. /// Requests are returned in the order they were executed. pub fn requests(&self) -> Arc> { - // Flatten: turn.steps.iter().flat_map(|s| s.requests.iter()) + // Flatten: recursively collect requests from the node tree. } } ``` -The `DiagnosticsContextBuilder` gains step-tracking methods: +The `DiagnosticsContextBuilder` gains node-tracking methods: ```rust impl DiagnosticsContextBuilder { - /// Records that a step has been enqueued for execution. - pub(crate) fn enqueue_step(&mut self, step_type: StepType) -> StepHandle { ... } - - /// Records that a step has started executing (acquired concurrency permit). - pub(crate) fn start_step(&mut self, handle: &StepHandle) { ... } + /// Records that a node has started executing. + pub(crate) fn start_node( + &mut self, + node_type: NodeType, + epk_range: Option>, + ) -> NodeHandle { ... } - /// Records that a step has completed, with its requests. - pub(crate) fn complete_step( + /// Records that a node has completed, with its requests and children. + pub(crate) fn complete_node( &mut self, - handle: StepHandle, + handle: NodeHandle, requests: Vec, + children: Vec, + outcome: NodeOutcome, ) { ... } } ``` -### 8.7 Granularity Control +### 8.6 Verbosity Control -The existing `DiagnosticsVerbosity` enum (Summary / Detailed) controls how the Turn/Step -tree is serialized: +The existing `DiagnosticsVerbosity` enum (Summary / Detailed) controls serialization: | Verbosity | Behavior | |-----------|----------| -| **Summary** | Step-level timing is included but per-step wait times may be omitted. Individual `RequestDiagnostics` are deduplicated/aggregated as they are today. Concurrency metadata is included (a few integers). | -| **Detailed** | Full tree: all step timestamps (enqueued/started/completed), all individual `RequestDiagnostics` with events, and concurrency metadata. 
| - -Point operations produce the same output as today at both verbosity levels — the Turn/Step -nesting is transparent when there's only one step. +| **Summary** | Node-level timing included. Individual `RequestDiagnostics` are deduplicated/aggregated as they are today. | +| **Detailed** | Full tree: all node timestamps, all individual `RequestDiagnostics` with events, all children. | -### 8.8 Pagination Context +Point operations produce the same output as today at both verbosity levels — the hierarchy +is transparent when there's only one node. -Each `execute_operation` call produces one `DiagnosticsContext` containing one Turn. The -SDK layer manages pagination and can: +### 8.7 Pagination Context -1. **Aggregate Turns** — collect `TurnDiagnostics` from multiple pages to produce a - summary of the full pagination operation (total RU, total duration, pages fetched). +Each `execute_operation` call produces one `DiagnosticsContext`. The SDK layer manages +pagination and can: -2. **Correlate across pages** — the continuation token can optionally carry a - `feed_operation_id` (UUID) so the SDK can link diagnostics from different - `execute_operation` calls that belong to the same logical feed operation. +1. **Aggregate across pages** — collect diagnostics from multiple pages to produce a + summary of the full feed operation (total RU, total duration, pages fetched). -3. **Create OTEL spans** — the SDK can create a parent span for the feed operation, - child spans for each Turn, and nested spans for each Step, using the timestamps +2. **Create OTEL spans** — the SDK can create a parent span for the feed operation, + child spans for each page, and nested spans for each node, using the timestamps and metadata from the diagnostics tree. The driver does not prescribe span structure — it provides the data. @@ -1573,46 +1390,36 @@ SDK layer manages pagination and can: ### 9.1 Partition Split During Execution -Fetch steps target **EPK ranges**, not PK range IDs. When a Fetch step receives a 410/1002 -(Gone — PartitionKeyRangeGone) response: +Fetch nodes target **EPK ranges**, not PK range IDs. When a Fetch node receives a 410/1002 +(Gone — PartitionKeyRangeGone) response, the Fetch node handles the split **internally**: 1. **Invalidate** the `PartitionKeyRangeCache` for the affected container. 2. **Re-fetch** the partition key ranges. -3. **Re-resolve** the Fetch step's EPK range to the new child PK range IDs. The step's EPK - range now maps to multiple PK ranges. -4. **Issue concurrent requests** to all child PK ranges within the step. The plan structure - does not change — the Fetch step internally fans out. There is no requirement that the - concurrency semaphore issues one permit per step; a single Fetch step may hold multiple - concurrent requests after a split. -5. **Resume execution** with the child range results. - -The plan graph remains stable across splits — no steps are added, removed, or rewired. +3. **Re-resolve** the Fetch node's EPK range to the new child PK range IDs. +4. **Internally split** — the single Fetch range issues requests to the appropriate + child PK ranges. +5. **Resume execution** with the child range result. + +The plan structure remains stable across splits. The Fetch node absorbs the split +internally without changing the plan graph. 
The next time the plan is generated (on the +next page), the Planner will see the new split ranges from the PK range cache and create +separate Fetch nodes for each child range — the continuation token's EPK bounds guide +the resume position correctly. + The continuation token survives because it stores EPK bounds (not PK range IDs), and the -Fetch step re-resolves those bounds to current PK range IDs on each execution. +Planner re-resolves those bounds to current PK range IDs on each page. ### 9.2 Error Propagation | Error Scenario | Behavior | |----------------|----------| -| 410/1002 (PartitionKeyRangeGone) | Fetch step re-resolves EPK range to child PK ranges, retries. | +| 410/1002 (PartitionKeyRangeGone) | Fetch node internally re-resolves EPK range, retries. | | 429 (Throttled) | Handled by transport pipeline (backoff + retry). | | 503 (Service Unavailable) | Handled by operation pipeline (region failover). | | 404 (Not Found) — container | Fail the entire feed operation. | -| 404 (Not Found) — item in ReadMany | Item omitted from results (not an error). | | Transient network error | Handled by transport pipeline (retry). | | Invalid continuation token | Fail with `ErrorKind::DataConversion`. | -### 9.3 Partial Failure in Fan-Out - -For ReadMany and cross-partition queries, if one PK range fails after exhausting all retries -(transport + operation pipeline), the entire feed operation fails. Partial results from -successful ranges are NOT returned. - -**Rationale:** Returning partial results would require the caller to distinguish between -"all items fetched" and "some items fetched, some failed" — a complex API that most callers -don't want. If partial results are needed in the future, they can be exposed via a separate -API or option. - --- ## 10. API Semantics & Invariants @@ -1620,10 +1427,9 @@ API or option. ### 10.1 Public API The driver exposes a single `execute_operation` method for **all** operations — both point -and feed. The driver is stateless across calls: each invocation runs one turn of the plan +and feed. The driver is stateless across calls: each invocation runs one page of the plan and returns a `CosmosResponse`. The response optionally includes a continuation token when -more pages are available. The higher-level SDK (e.g., `azure_data_cosmos`) decides which -operations to surface as pagers from a UX perspective. +more pages are available. ```rust impl CosmosDriver { @@ -1632,20 +1438,16 @@ impl CosmosDriver { /// For point operations (read, create, delete, etc.), this returns the /// single response with no continuation token. /// - /// For feed operations (query, read-many, read-all), this executes one - /// turn of the plan and returns a page of results. If more pages are - /// available, the response includes a `ContinuationToken`. The caller - /// passes this token back in `OperationOptions` to fetch the next page. - /// - /// The driver does not manage pagination state — it acts as a stateless - /// service. The SDK layer is responsible for threading continuation tokens - /// across calls to implement pagers/streams. + /// For feed operations (read-all), this executes one page of the plan + /// and returns the result. If more pages are available, the response + /// includes a `ContinuationToken`. The caller passes this token back + /// in `OperationOptions` to fetch the next page. 
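+    ///
+    /// Illustrative paging loop (a sketch only — `with_continuation` and the
+    /// `consume` callback are hypothetical helpers, not part of this spec):
+    ///
+    /// ```ignore
+    /// let mut continuation = None;
+    /// loop {
+    ///     let opts = base_options.clone().with_continuation(continuation.take());
+    ///     let response = driver.execute_operation(operation.clone(), opts).await?;
+    ///     consume(&response); // hand one page of results to the caller
+    ///     match response.continuation_token() {
+    ///         Some(token) => continuation = Some(token.clone()),
+    ///         None => break, // no more pages
+    ///     }
+    /// }
+    /// ```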
pub async fn execute_operation(
        &self,
        operation: CosmosOperation,
        options: OperationOptions,
    ) -> azure_core::Result<CosmosResponse> {
-        // Plan → Execute one turn → return CosmosResponse
+        // Plan → Execute one page → return CosmosResponse
    }
}
```
@@ -1688,7 +1490,7 @@ impl CosmosResponse {

### 10.3 OperationOptions Changes

-`OperationOptions` gains feed-specific fields for continuation and concurrency:
+`OperationOptions` gains feed-specific fields:

```rust
pub struct OperationOptions {
@@ -1698,10 +1500,6 @@ pub struct OperationOptions {
    /// If not set, the server default applies.
    max_item_count: Option<u32>,

-    /// Maximum number of concurrent partition key range fetches
-    /// (feed operations only). Default: min(num_pk_ranges, 10).
-    max_concurrency: Option<u32>,
-
    /// Continuation token for resuming a previous feed operation.
    /// Pass the token from a previous `CosmosResponse::continuation_token()`.
    continuation: Option<ContinuationToken>,
@@ -1714,200 +1512,142 @@ These fields are ignored for point operations.

### 10.4 Ordering Guarantees

| Operation | Order Guarantee |
|-----------|-----------------|
-| ReadMany | Unordered across partitions. Within each partition, (PartitionKey, ID) ascending. |
-| Single-partition query | Server-determined order: (PartitionKey, ID) ascending, or as specified by ORDER BY. |
-| Cross-partition query (no ORDER BY) | Within each partition, (PartitionKey, ID) ascending. Across partitions, unordered (partition results are concatenated by `UnorderedMerge`). |
-| Cross-partition query (ORDER BY) | Globally ordered per ORDER BY clause (future work: `OrderedMerge` k-way merge). |
-| ReadFeed (single partition) | (PartitionKey, ID) ascending. |
-| ReadFeed (cross-partition) | Within each partition, (PartitionKey, ID) ascending. Across partitions, unordered. |
+| ReadAll (single partition) | (PartitionKey, ID) ascending. |
+| ReadAll (cross-partition) | Within each partition, (PartitionKey, ID) ascending. Across partitions, items are yielded in EPK order (implementation behavior, not a service guarantee). |

### 10.5 Page Boundaries

-Page boundaries are determined by:
+Each `execute_operation` call for ReadAll returns exactly one page from exactly one partition:
+
- **Server-side max item count**: The server may return fewer items than requested.
- **Client-side max item count**: Configurable via `OperationOptions::max_item_count`.
- **Server continuation**: A page boundary occurs whenever the server returns a continuation token.
+- **Partition boundary**: When a partition is fully drained (no server continuation), the
+  current page is returned. The next call starts the next partition.

-For ReadMany, there is exactly one logical page (the merged result), regardless of how many
-server-side pages were consumed internally.
-
----
-
-## 11. Configuration Surface
-
-### 11.1 OperationOptions Additions
-
-Feed-specific options are added to `OperationOptions` (see §10.3). They are ignored for
-point operations. The existing layered resolution applies:
-
-1. `OperationOptions` (per-call)
-2. `DriverOptions` (per-driver)
-3. `CosmosDriverRuntime` (global)
-4. Environment variables
-
-The `max_concurrency`, `max_item_count`, and `continuation` fields follow the same precedence.
+Pages never span partition boundaries.

---

-## 12. Performance & Non-Regression
-
-### 12.1 Point Operation Overhead
-
-The plan model MUST NOT regress point operation performance. Requirements:
-
-- **No heap allocation** for trivial plans beyond what `execute_operation` does today.
-- **No additional async machinery** (no spawning, no channels) for single-step plans. -- **Benchmark**: Point operation latency with the plan model must be within 1% of the - current direct `execute_single_operation` call. +## 11. Testing Strategy -Implementation: For point operations and single-partition feeds, the Planner produces an -`OperationPlan::Trivial` — a stack-allocated single step with no `Vec` overhead. The -executor matches on `Trivial` and calls `execute_single_operation` directly with no -graph traversal. The plan model is only heap-allocated for multi-step fan-out operations. - -### 12.2 Fan-Out Memory Bounds - -For ReadMany: -- Buffered data is bounded by the total size of all items in the response. -- The executor does not buffer more than `max_concurrency` in-flight requests. - -For paginated queries: -- Each turn buffers at most one page per in-flight partition fetch. -- Total buffer: `max_concurrency × max_page_size`. - ---- - -## 13. Migration Plan - -### Phase 1: OperationType / OperationPayload Refactor - -1. Add `OperationPayload` enum. -2. Add `OperationTarget` enum. -3. Update `CosmosOperation` to use `OperationPayload` and `OperationTarget`. -4. Update factory methods. -5. Update transport pipeline request builder to extract body from `OperationPayload`. -6. Remove `body: Option>` from `CosmosOperation`. -7. Update all callers (driver internals, tests, `azure_data_cosmos` bridge). - -**This is a breaking internal change.** The `body` field and `partition_key` field on -`CosmosOperation` are replaced. All internal callers must be updated. - -### Phase 2: Plan Infrastructure - -1. Implement `OperationPlan`, `PlanStep`. -2. Implement `Planner` with trivial single-step planning (point ops only). -3. Implement `PlanExecutor` for single-step plans. -4. Wire `execute_operation` through Plan → Execute path (with fast-path bypass). -5. Validate no performance regression via benchmarks. - -### Phase 3: ReadMany - -1. Implement ReadMany planning in `Planner`: - - Group items by PK range (via `PartitionKeyRangeCache`). - - Create fan-out `Fetch` steps + `UnorderedMerge` step. -2. Implement `UnorderedMerge` step execution in `PlanExecutor`. -3. Wire `execute_operation` to use Plan → Execute for feed operations. -4. Extend `CosmosResponse` with optional `continuation_token` field. -5. Integration tests with partition splits. - -### Phase 4: Single-Partition Queries - -1. Implement single-partition query planning. -2. Implement paginated execution (continuation threading). -3. Implement `ContinuationToken` serialization. - -### Phase 5: Cross-Partition Queries - -1. Implement query plan fetching in `Planner`. -2. Implement multi-range query planning. -3. Implement incremental page production for unordered queries. - -### Phase 6: Advanced Query Features (Future) - -1. ORDER BY merge-sort. -2. Aggregation. -3. Change feed. - ---- - -## 14. Testing Strategy - -### 14.1 Unit Tests +### 11.1 Unit Tests | Test Area | Cases | |-----------|-------| -| Planner — point ops | Verify trivial single-step plan for each point operation type. | -| Planner — ReadMany | Verify correct grouping by PK range. Items spread across ranges. | -| Planner — single-partition query | Verify single `Fetch` step with correct targeting. | -| PlanExecutor — single step | Execute trivial plan, verify result matches direct pipeline call. | -| PlanExecutor — fan-out | Execute multi-step plan with mock pipeline, verify merge. 
|
-| PlanExecutor — concurrency | Verify concurrency cap is respected (at most N concurrent fetches). |
-| ContinuationToken — serialize | Serialize to string, verify output. |
+| Planner — point ops | Verify SingleNode plan for each point operation type. |
+| Planner — ReadAll | Verify Graph plan with Drain root, correct Fetch children per PK range. |
+| Planner — ReadAll resume | Verify resume skips drained partitions, resumes the active one, and starts ranges to the right of it fresh. |
+| Planner — bottom-up invariant | Verify children always have lower NodeIds than parents. |
+| PlanExecutor — single node | Execute SingleNode plan, verify result matches direct pipeline call. |
+| PlanExecutor — drain | Execute Drain plan with mock pipeline, verify sequential execution. |
+| PlanExecutor — drain page boundary | Verify pages don't span partition boundaries. |
+| ContinuationToken — serialize | Serialize to base64url string, verify roundtrip. |
 | ContinuationToken — deserialize | Deserialize from explicit string, verify result. |
 | ContinuationToken — version compat | Older version tokens deserialize correctly. |
+| ContinuationToken — future version | Token with version > current is rejected. |
+| ContinuationToken — operation kind | Token with wrong operation kind is rejected. |
 | ContinuationToken — split recovery | Token with EPK bounds spanning a split range maps to correct child ranges. |
-| ContinuationToken — O(1) size | Token size is constant regardless of partition count (only one Fetch leaf stored). |
-| ContinuationToken — Drain resume | Drain node correctly classifies partitions as left/target/right from nested Fetch. |
-| ContinuationToken — OrderBy resume | OrderBy node generates correct range filters and dedup state from nested Fetch + lastValues. |
-| ContinuationToken — nesting | Nested tokens (e.g., future OffsetLimit wrapping OrderBy wrapping Fetch) round-trip correctly. |
+| ContinuationToken — Drain resume | Drain node correctly classifies partitions as left/target/right. |
+| ContinuationToken — nesting | Nested tokens round-trip correctly through serialize/deserialize. |
+| ContinuationToken — unknown variant | Unknown `ResumeState` type fails gracefully on deserialize. |
+| NodeId/NodeRange | Verify range iteration, length, empty checks. |
 | OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `EpkRange` produce correct targets. |
+| Diagnostics — hierarchy | Verify recursive node tree structure appears in diagnostics JSON. |
+| Diagnostics — children | Verify composite nodes contain child node diagnostics. |
+| Diagnostics — backward compat | Verify `requests()` flattening returns all requests from nested nodes. |

-### 14.2 Integration Tests
+### 11.2 Integration Tests

| Test Area | Cases |
|-----------|-------|
-| ReadMany — basic | Read 10 items across 3 partitions, verify all returned. |
-| ReadMany — missing items | Read items where some don't exist, verify present items returned. |
-| ReadMany — single partition | All items in one partition, verify no unnecessary fan-out. |
-| ReadMany — partition split | Trigger split during ReadMany, verify Fetch step re-resolves and completes. |
-| ReadMany — large set | Read 1000 items, verify server-side pagination within each range works. |
-| Query — single partition | Execute paginated query, verify continuation threading. |
-| Query — resume | Execute query, get continuation, pass token back in next call, verify continues. |
-| Diagnostics | Verify RU charges are aggregated across fan-out steps.
| -| Throughput control | Verify fan-out respects throughput control group limits. | - -### 14.3 Performance Tests +| ReadAll — basic | Read all items from a container, verify all returned in EPK order. | +| ReadAll — empty container | ReadAll on empty container returns no results, no continuation. | +| ReadAll — single partition | All items in one partition, verify SingleNode plan execution. | +| ReadAll — multi partition | Items across multiple partitions, verify sequential drain. | +| ReadAll — pagination | Verify continuation token threads correctly across pages. | +| ReadAll — resume | Get continuation mid-stream, resume from it, verify continued results. | +| ReadAll — resume across SDK versions | Serialize token, deserialize with newer SDK, verify resume works. | +| ReadAll — partition split | Trigger split during ReadAll, verify Fetch node re-resolves and completes. | +| ReadAll — large dataset | Read many items, verify all pages and partitions are drained. | +| Diagnostics — RU aggregation | Verify total RU charge sums across all pages. | +| Diagnostics — plan structure | Verify diagnostics JSON shows Drain/Fetch hierarchy with children. | + +### 11.3 Performance Tests | Test Area | Metric | |-----------|--------| | Point op overhead | Latency regression < 1% vs. direct `execute_single_operation`. | -| ReadMany fan-out | Latency scales sub-linearly with partition count (concurrency works). | -| Memory bounds | Peak memory for ReadMany of N items is O(N × item_size). | +| ReadAll latency | Sequential partition drain does not introduce unnecessary overhead. | --- -## 15. Future Work +## 12. Future Work + +### 12.1 ReadMany -### 15.1 Change Feed +ReadMany reads multiple items by (ID, PartitionKey) pairs. It requires grouping items by +PK range, creating concurrent `Fetch` nodes, and merging results via an `UnorderedMerge` +node. This adds concurrency control (semaphore-based) to the PlanExecutor and a new +`PlanNode::UnorderedMerge` variant. -The change feed is a specialized feed operation with unique characteristics: -- Start-from-beginning, start-from-now, or start-from-timestamp. -- Lease-based partition assignment (for multi-consumer scenarios). -- Scoped to feed ranges (EPK ranges). -- Incremental mode vs. full-fidelity mode. +### 12.2 Cross-Partition Queries -The current spec reserves extension points in `OperationPayload`, `OperationTarget`, -`PlanStep`, and `ResumeState` for change feed support. +Cross-partition queries require fetching a backend query plan, creating `Fetch` nodes per +partition, and optionally performing client-side sort for ORDER BY queries via an +`OrderedMerge` node. This adds query plan fetching callbacks to the Planner and k-way +merge logic to the PlanExecutor. -### 15.2 ORDER BY Merge-Sort +### 12.3 Change Feed + +The change feed is a specialized feed operation with unique characteristics: start-from +modes, lease-based partition assignment, and incremental/full-fidelity modes. + +Unlike ReadAll's sequential drain (where only the active partition's state is needed), +change feed requires **per-range continuation tokens**. Each feed range maintains its +own server continuation, and the resume state is a list of per-range tokens: + +```rust +// Future ResumeState variant (illustrative) +#[serde(rename = "changeFeed")] +ChangeFeed(ChangeFeedState), + +#[serde(rename_all = "camelCase")] +struct ChangeFeedState { + /// Per-range continuation tokens. + /// Each entry tracks one feed range's EPK bounds and its + /// server-provided continuation token. 
+ range_tokens: Vec, +} + +#[serde(rename_all = "camelCase")] +struct RangeToken { + epk_min: String, + epk_max: String, + server_token: Option, +} +``` -Cross-partition queries with ORDER BY require a k-way merge of sorted partition streams. -This will be implemented as a `Sort` variant of `PlanStep` that consumes partition `Fetch` -step heads and produces globally ordered pages. +This is an example where per-partition state is necessary (the token size is O(N) in +range count), as noted in [§7.1 Design Principles](#71-design-principles). The plan +model reserves extension points in `PlanNode` and `ResumeState` for change feed support. -### 15.3 Aggregation +### 12.4 Concurrency -Queries with aggregation functions (COUNT, SUM, AVG, etc.) require client-side accumulation -across partitions. This will be implemented as an `Aggregate` variant of `PlanStep`. +Future operations (ReadMany, cross-partition queries) will require concurrent partition +fetching. The concurrency permit model described in [§5.6](#56-future-extensions) provides +the foundation: a shared semaphore limits the number of concurrent permits, and each plan +node acquires a permit before executing. This will add `UnorderedMerge` / `OrderedMerge` +nodes to the plan model. -### 15.4 Payload Awareness +### 12.5 Cached Operation Plans -For sort and aggregation, the driver must understand feed response envelopes (the JSON -structure containing the items array, count, etc.). This will require a light JSON parsing -layer in the executor, not full item deserialization. +For in-process callers that call `execute_operation` in a loop, caching the `OperationPlan` +across pages (invalidating on metadata changes) would avoid re-planning on every page. This +is a performance optimization, not a correctness concern. -### 15.5 Hedging for Feed Operations +### 12.6 Hedging for Feed Operations The existing hedging mechanism (speculative execution in secondary regions) could be extended -to individual plan steps, allowing fan-out fetches to hedge independently. +to individual plan nodes, allowing feed fetches to hedge independently. From 2ebf07aba078f19ac84fc9522e2623f1452390f7 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Thu, 30 Apr 2026 23:34:40 +0000 Subject: [PATCH 09/29] pr feedback --- .../docs/FEED_OPERATIONS_SPEC.md | 541 +++++++++++------- 1 file changed, 323 insertions(+), 218 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 72d26ffa467..6b33cdf331e 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -17,7 +17,7 @@ 6. [Plan Executor](#6-plan-executor) 7. [Continuation Tokens](#7-continuation-tokens) 8. [Diagnostics Structure](#8-diagnostics-structure) -9. [Error Handling & Partition Splits](#9-error-handling--partition-splits) +9. [Error Handling, Splits & Merges](#9-error-handling--partition-splits--merges) 10. [API Semantics & Invariants](#10-api-semantics--invariants) 11. [Testing Strategy](#11-testing-strategy) 12. [Future Work](#12-future-work) @@ -46,7 +46,7 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i 1. **Unified execution model** — Both point and feed operations flow through a common Plan → Execute pipeline. Point operations produce a trivial single-node plan. 
Feed operations produce multi-node plans that leverage the existing point-operation pipeline for individual
-   HTTP requests.
+   Cosmos requests.

 2. **Resumable pagination** — Feed operations produce a typed continuation token that can be
    serialized to a string and carried across process boundaries (e.g., sent to a browser).
@@ -61,9 +61,12 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i
    partition-level failover (PPAF/PPCB), throughput control, session consistency, and
    diagnostics — all managed by the driver.

-5. **Schema-agnostic pages** — The driver returns response pages as raw bytes (`Vec<u8>`).
-   The higher-level SDK handles deserialization, consistent with the existing `CosmosResponse`
-   model.
+5. **Schema-agnostic pages** — The driver returns feed pages as a list of pre-parsed item
+   bodies (`Vec<Vec<u8>>`), each entry being the raw serialized bytes of one item. Point
+   operations continue to return a single body (`Vec<u8>`). The driver does not deserialize
+   item bodies; the higher-level SDK handles deserialization. To support both shapes through
+   a single `CosmosResponse` type, this spec introduces a `ResponseBody` enum (analogous to
+   `OperationPayload` for requests) — see [§10.2 CosmosResponse Changes](#102-cosmosresponse-changes).

 6. **Performance non-regression** — Point operations must not pay measurable overhead for the
    unified plan model. Trivial plans must be allocation-light. No heap allocation for trivial
@@ -73,6 +76,8 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i
 ### Non-Goals (This Spec)

 - Full cross-partition query execution with ORDER BY merge-sort and aggregation (future work).
+- Backend query plan retrieval and interpretation (future work; required for cross-partition
+  queries but not for ReadAll).
 - Change feed full design (future work; this spec reserves extension points).
 - ReadMany fan-out with concurrent partition fetching (future work).
 - Client-side query rewriting or optimization.
@@ -81,7 +86,10 @@
 ### Primary Target

 **ReadAll** is the first feed operation to implement. It reads all documents from a container by
-draining partitions sequentially in effective partition key (EPK) order. It exercises:
+draining partitions sequentially in effective partition key (EPK) order. Items are returned in
+their **natural order**: ascending by `(EffectivePartitionKey, RID)`. Within each partition the
+server returns items in ascending RID order; across partitions the driver iterates partitions
+in ascending EPK order. ReadAll exercises:

 - Partition key range resolution (via `PartitionKeyRangeCache`)
 - Sequential traversal across partition key ranges in EPK order
@@ -95,76 +103,65 @@ Sections on continuation tokens and the plan model are designed to be extensible
 operations (ReadMany, cross-partition query, change feed) without requiring a redesign.

 **Ordering semantics:** ReadAll drains partitions in EPK order as an implementation behavior.
-Within each partition, items are returned in (PartitionKey, ID) ascending order — the natural
-sort order of `SELECT *`. This is a driver-emitted ordering, **not** a service-level ordering
-guarantee. The service does not guarantee global cross-partition order without explicit
-`ORDER BY`.
+Within each partition, items are returned in ascending RID order — the natural sort order of
+`SELECT *`. The combined output is therefore ascending by `(EffectivePartitionKey, RID)`.
This +is a driver-emitted ordering, **not** a service-level ordering guarantee. The service does not +guarantee global cross-partition order without explicit `ORDER BY`. --- ## 2. Architectural Overview -```text -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ CosmosDriver │ -│ │ -│ execute_operation(op, opts) → CosmosResponse │ -│ │ -│ A single entry point for ALL operations (point and feed). │ -│ Returns a CosmosResponse which optionally includes a continuation │ -│ token. Point reads never have one; feed operations may. │ -│ The SDK layer decides which operations to expose as pagers. │ -│ │ -│ Internally: │ -│ 1. Planner creates an OperationPlan │ -│ 2. PlanExecutor runs one page of the plan │ -│ 3. Returns CosmosResponse (with optional continuation token) │ -│ │ -│ ┌──────────────────────────────────────────────────────────────────────────┐ │ -│ │ PLANNER │ │ -│ │ │ │ -│ │ Input: CosmosOperation + OperationOptions │ │ -│ │ Output: OperationPlan │ │ -│ │ │ │ -│ │ Responsibilities: │ │ -│ │ ┌─ Determine targeting (point EPK, sub-range, full key space) │ │ -│ │ ├─ For ReadAll: resolve PK ranges, create Drain over Fetch nodes │ │ -│ │ ├─ For single-partition ops: create single-node plan │ │ -│ │ └─ For point ops: create trivial single-node plan │ │ -│ └──────────────────────────────────────────────────────────────────────────┘ │ -│ │ │ -│ ▼ │ -│ ┌──────────────────────────────────────────────────────────────────────────┐ │ -│ │ PLAN EXECUTOR │ │ -│ │ │ │ -│ │ Input: OperationPlan │ │ -│ │ Output: CosmosResponse (single page) │ │ -│ │ │ │ -│ │ Responsibilities: │ │ -│ │ ┌─ Execute one Fetch node via execute_single_operation() │ │ -│ │ ├─ Handle partition splits (Fetch resolves EPK → PK ranges) │ │ -│ │ ├─ Collect node-level diagnostics (timing) │ │ -│ │ └─ Produce continuation token in response (if more pages remain) │ │ -│ └──────────────────────────────────────────────────────────────────────────┘ │ -│ │ │ -│ ▼ │ -│ ┌──────────────────────────────────────────────────────────────────────────┐ │ -│ │ OPERATION PIPELINE (existing) │ │ -│ │ │ │ -│ │ execute_single_operation() — unchanged │ │ -│ │ Handles: region failover, session tokens, transport retry, auth, │ │ -│ │ 429 backoff, diagnostics │ │ -│ └──────────────────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────────────────────┘ +`CosmosDriver::execute_operation` is the single entry point for **all** operations — both +point and feed. The driver is stateless across calls: each invocation produces a fresh +`OperationPlan` (consulting the input continuation token if present), executes one page of +that plan, and returns a `CosmosResponse` with an optional continuation token. Point +operations always return without a continuation; feed operations return one when more pages +remain. The SDK layer decides which operations to expose to its callers as pagers. + +```mermaid +flowchart TB + Caller["SDK / caller
execute_operation(op, opts)"] + + subgraph Driver["CosmosDriver"] + direction TB + + Planner["Planner
──────────
Input: CosmosOperation + OperationOptions
Output: OperationPlan

• Determines targeting (point PK, FeedRange, full key space)
• ReadAll: resolves PK ranges → SequentialDrain over Request nodes
• Single-partition ops: single-node plan
• Point ops: trivial single-node plan"] + + Executor["PlanExecutor
──────────
Input: OperationPlan
Output: CosmosResponse (single page)

• Executes one Request node per call
• Handles partition splits / merges (Request re-resolves EPK → PK)
• Collects node-level diagnostics
• Builds continuation token if more pages remain"] + + Pipeline["Operation Pipeline (existing)
──────────
execute_single_operation()

• Region failover
• Session tokens
• Transport retry, auth, 429 backoff
• Per-request diagnostics"] + + Planner --> Executor + Executor --> Pipeline + end + + Caller -->|"CosmosOperation,
OperationOptions"| Planner + Pipeline -.->|"per-request
response"| Executor + Executor -->|"CosmosResponse
(+ continuation?)"| Caller + + classDef component fill:#f5f5f5,stroke:#333,stroke-width:1px,text-align:left + classDef caller fill:#e8f0ff,stroke:#333,stroke-width:1px + class Planner,Executor,Pipeline component + class Caller caller ``` +Internally, every call follows the same three-step flow: + +1. **Plan** — the Planner converts the `CosmosOperation` (plus any input continuation + token) into an `OperationPlan`. +2. **Execute one page** — the PlanExecutor walks the plan and issues exactly one Cosmos + request via `execute_single_operation`. +3. **Respond** — the executor returns a `CosmosResponse`, attaching a continuation token + when more pages remain. + ### Layer Separation The existing `execute_operation_pipeline` function is renamed to **`execute_single_operation`** in this spec. It remains the internal entry point for executing a single Cosmos DB operation through the operation pipeline (region failover, session tokens, transport retry, auth, 429 backoff, diagnostics). The feed operations layer calls `execute_single_operation` for each -individual HTTP request within a plan. +individual Cosmos request within a plan. | Concern | Component | Location | |---------|-----------|----------| @@ -173,7 +170,7 @@ individual HTTP request within a plan. | Plan model | `OperationPlan`, `PlanNode` | `driver/plan/plan.rs` (new) | | Plan execution | `PlanExecutor` | `driver/plan/executor.rs` (new) | | Continuation state | `ContinuationToken` | `models/continuation_token.rs` (new) | -| Per-node HTTP execution | `execute_single_operation` | `driver/pipeline/` (existing) | +| Per-node request execution | `execute_single_operation` | `driver/pipeline/` (existing) | ### Open Issue: Re-Planning on Every Page @@ -183,7 +180,7 @@ token to reconstruct the plan state, but still performs the full planning step ( resolution) on each page. For in-process callers (the common case), this is wasteful: the SDK crate calls -`execute_operation` in a loop, and the plan structure doesn't change between pages (Fetch +`execute_operation` in a loop, and the plan structure doesn't change between pages (Request nodes handle partition splits internally by re-resolving EPK ranges). A future optimization could allow `CosmosResponse` and/or `CosmosOperation` to carry a **cached `OperationPlan`** so that subsequent requests skip re-planning when the plan is still valid. The cached plan @@ -277,32 +274,49 @@ pub enum OperationTarget { /// value must be included in the request headers. PartitionKey(PartitionKey), - /// Target an effective partition key range. + /// Target a specific feed range. /// /// Used for feed operations that span one or more partitions. - /// Uses the existing `EpkRange` type from - /// `models::range`. + /// Uses the `FeedRange` type, which represents a contiguous span + /// of effective partition key (EPK) space. See §3.2.1 below for + /// the type's origin. /// - /// The pipeline resolves the EPK range to the owning PK range ID(s) via + /// The pipeline resolves the FeedRange to the owning PK range ID(s) via /// the `PartitionKeyRangeCache` at execution time. - EpkRange(EpkRange), + FeedRange(FeedRange), } ``` -**Implementation note:** `EpkRange` requires `T: Ord + Clone`. The driver's -`EffectivePartitionKey` type already implements `Ord`, so the existing implementation -is sufficient for `OperationTarget::EpkRange`. 
+#### 3.2.1 Migrating `FeedRange` from `azure_data_cosmos` + +The `FeedRange` type currently lives in `azure_data_cosmos::feed_range` (see +`sdk/cosmos/azure_data_cosmos/src/feed_range.rs`). It is the public, opaque, cross-SDK-compatible +representation of a contiguous EPK range, with stable wire formats (base64-encoded JSON via +`Display`/`FromStr`, and structured JSON via `Serialize`/`Deserialize`). + +This spec proposes **migrating `FeedRange` into the driver** (`azure_data_cosmos_driver`) so +that it can be used by `OperationTarget`, `ContinuationToken` resume state, and diagnostics +without crossing crate boundaries. The `azure_data_cosmos` crate then re-exports `FeedRange` +to preserve the existing public API. + +Rationale: +- The driver's `OperationTarget::FeedRange` variant must be public (`OperationTarget` is a + driver-public type), so it cannot use a `pub(crate)` driver-internal range type. +- `FeedRange` is already designed as a stable, cross-SDK-compatible type; promoting it to the + driver consolidates the canonical definition in one place. +- Other driver-internal range types (e.g., `EpkRange`) remain `pub(crate)` and continue to + serve their internal callers. + +Migration steps (out of scope for this spec, but for context): +1. Move `feed_range.rs` to `azure_data_cosmos_driver`. +2. Re-export `FeedRange` from `azure_data_cosmos` (e.g., `pub use azure_data_cosmos_driver::FeedRange;`). +3. Update internal driver code to consume `FeedRange` directly rather than its old location. ```rust impl OperationTarget { /// The full key space: targets all partition key ranges. pub fn all_ranges() -> Self { - Self::EpkRange(EpkRange::new( - EffectivePartitionKey::MIN, - EffectivePartitionKey::MAX, - true, - false, - )) + Self::FeedRange(FeedRange::all_ranges()) } } ``` @@ -352,7 +366,7 @@ method or via `.with_payload(...)`. A convenience method `with_body(Vec)` ca sugar for `with_payload(OperationPayload::Body(...))`. The transport pipeline's request builder must be updated to extract body bytes from -`OperationPayload` when constructing the HTTP request. For `Body` variants, this is +`OperationPayload` when constructing the Cosmos request. For `Body` variants, this is straightforward. For `None`, no body is sent. Future payload variants (Query, ReadMany) will be handled by the Planner before reaching the transport pipeline. @@ -437,42 +451,45 @@ pub(crate) enum OperationPlan { /// A node in an operation plan. /// /// Nodes reference each other via `NodeId` and `NodeRange` within the -/// flat node list. Composite nodes (Drain) reference child nodes; -/// leaf nodes (Fetch) have no children. +/// flat node list. Composite nodes (SequentialDrain) reference child nodes; +/// leaf nodes (Request) have no children. pub(crate) enum PlanNode { - /// Execute a single HTTP request via the operation pipeline. + /// Execute a single Cosmos request via the operation pipeline. /// - /// Each Fetch node targets a specific **EPK range** (not a PK range ID). + /// Each Request node targets a specific **EPK range** (not a PK range ID). /// At execution time, the node resolves its EPK range to the current PK - /// range ID(s) via the `PartitionKeyRangeCache`. If the EPK range maps - /// to multiple PK ranges (due to a partition split), the Fetch node - /// internally re-resolves and issues requests to the appropriate child - /// PK ranges. The next time the plan is generated, the EPK ranges will - /// reflect the split, and the plan resumes with the new ranges. 
- Fetch { + /// range ID(s) via the `PartitionKeyRangeCache`. The Request node handles + /// both **splits** (its EPK range maps to multiple child PK ranges) and + /// **merges** (its EPK range falls entirely within a larger merged PK + /// range) by issuing requests against the appropriate current PK ranges. + /// In the merge case, the Request must include EPK min/max headers so the + /// server only returns items inside the original range. The next time + /// the plan is generated, EPK ranges will reflect the new topology and + /// the plan resumes with the new ranges. + Request { /// The operation to execute, targeted to a specific EPK range. - /// Wrapped in `Arc` so that sibling Fetch nodes can share the base + /// Wrapped in `Arc` so that sibling Request nodes can share the base /// operation without cloning the full payload (headers, resource /// reference, etc.). operation: Arc, /// Options for this fetch. options: OperationOptions, /// The EPK range this fetch targets. - epk_range: EpkRange, + feed_range: FeedRange, /// Server-provided continuation token for this range, if resuming. continuation: Option, }, /// Sequential cross-partition drain. /// - /// Enumerates child Fetch nodes in EPK order, draining each partition + /// Enumerates child Request nodes in EPK order, draining each partition /// completely before moving to the next. Each page comes from exactly /// one partition — pages do not span partition boundaries. /// /// Within each partition, items are returned in (PartitionKey, ID) /// ascending order (the natural server sort order). - Drain { - /// Child Fetch nodes, ordered by EPK range. + SequentialDrain { + /// Child Request nodes, ordered by EPK range. /// References a contiguous range in the plan's node list. children: NodeRange, }, @@ -486,8 +503,8 @@ pub(crate) enum PlanNode { ### 4.2 Bottom-Up Invariant -The flat node list is always built **bottom-up**: leaf nodes (Fetch) are pushed first, -then their parent (Drain) is pushed after them. This produces a deterministic layout where +The flat node list is always built **bottom-up**: leaf nodes (Request) are pushed first, +then their parent (SequentialDrain) is pushed after them. This produces a deterministic layout where `NodeId` values are stable for a given set of inputs. For a ReadAll plan over 3 partitions, the node list looks like: @@ -495,26 +512,26 @@ For a ReadAll plan over 3 partitions, the node list looks like: ```text Index Node ───── ────────────────────────────────────────── - 0 Fetch { epk_range: ["","55"), ... } - 1 Fetch { epk_range: ["55","AA"), ... } - 2 Fetch { epk_range: ["AA","FF"), ... } - 3 Drain { children: NodeRange(0..3) } + 0 Request { feed_range: ["","55"), ... } + 1 Request { feed_range: ["55","AA"), ... } + 2 Request { feed_range: ["AA","FF"), ... } + 3 SequentialDrain { children: NodeRange(0..3) } root = NodeId(3) ``` -The `NodeRange(0..3)` for the Drain's children is a zero-cost reference to the contiguous -slice of Fetch nodes. No `Vec` allocation is needed. +The `NodeRange(0..3)` for the SequentialDrain's children is a zero-cost reference to the contiguous +slice of Request nodes. No `Vec` allocation is needed. ### 4.3 Plan Examples #### Point Operation (ReadItem) ```text -SingleNode(Fetch { operation: read_item, epk_range: pk_epk, continuation: None }) +SingleNode(Request { operation: read_item, feed_range: pk_epk, continuation: None }) ``` -A `SingleNode` plan with one `Fetch` node. The executor runs it directly, gets a +A `SingleNode` plan with one `Request` node. 
The executor runs it directly, gets a `CosmosResponse`, done. No heap allocation. #### ReadAll (Cross-Partition) @@ -522,17 +539,17 @@ A `SingleNode` plan with one `Fetch` node. The executor runs it directly, gets a ```text Graph { nodes: [ - 0: Fetch { epk_range: ["","55"), continuation: None }, - 1: Fetch { epk_range: ["55","AA"), continuation: None }, - 2: Fetch { epk_range: ["AA","FF"), continuation: None }, - 3: Drain { children: NodeRange(0..3) }, + 0: Request { feed_range: ["","55"), continuation: None }, + 1: Request { feed_range: ["55","AA"), continuation: None }, + 2: Request { feed_range: ["AA","FF"), continuation: None }, + 3: SequentialDrain { children: NodeRange(0..3) }, ], root: NodeId(3), } ``` The executor processes partitions sequentially: -1. Fetch all pages from EPK range `["","55")` until that partition is drained. +1. Request all pages from EPK range `["","55")` until that partition is drained. 2. Move to EPK range `["55","AA")`, fetch all pages. 3. Move to EPK range `["AA","FF")`, fetch all pages. @@ -550,15 +567,15 @@ starting from the active range: ```text Graph { nodes: [ - 0: Fetch { epk_range: ["55","AA"), continuation: Some("xyz") }, - 1: Fetch { epk_range: ["AA","FF"), continuation: None }, - 2: Drain { children: NodeRange(0..2) }, + 0: Request { feed_range: ["55","AA"), continuation: Some("xyz") }, + 1: Request { feed_range: ["AA","FF"), continuation: None }, + 2: SequentialDrain { children: NodeRange(0..2) }, ], root: NodeId(2), } ``` -Only the remaining partitions are in the plan. The first Fetch carries the server +Only the remaining partitions are in the plan. The first Request carries the server continuation from the token. ### 4.4 SingleNode Optimization @@ -577,7 +594,7 @@ direct `execute_single_operation` call. The `OperationPlan::SingleNode` variant ### 5.1 Responsibilities The Planner transforms a `CosmosOperation` into an `OperationPlan`. For ReadAll, this is -synchronous: resolve partition key ranges and build a `Drain` node over `Fetch` children. +synchronous: resolve partition key ranges and build a `SequentialDrain` node over `Request` children. ```rust pub(crate) struct Planner<'a> { @@ -589,7 +606,7 @@ impl<'a> Planner<'a> { /// Creates an operation plan from a CosmosOperation. /// /// For point operations, this is synchronous and trivial. - /// For ReadAll, this resolves PK ranges and builds a Drain plan. + /// For ReadAll, this resolves PK ranges and builds a SequentialDrain plan. pub async fn plan( &self, operation: &CosmosOperation, @@ -607,10 +624,10 @@ impl<'a> Planner<'a> { | Operation | Targeting | Plan Strategy | |-----------|-----------|---------------| -| ReadItem, DeleteItem, etc. | `PartitionKey` | Single `Fetch` node. SingleNode. | -| CreateDatabase, ReadContainer, etc. | `None` | Single `Fetch` node. SingleNode. | -| ReadAllItems (single partition) | `PartitionKey` | Single `Fetch` node. Paginated. | -| ReadAllItems (cross-partition) | `EpkRange` (`all_ranges()`) | Resolve PK ranges → `Drain` over N `Fetch` nodes. Sequential. | +| ReadItem, DeleteItem, etc. | `PartitionKey` | Single `Request` node. SingleNode. | +| CreateDatabase, ReadContainer, etc. | `None` | Single `Request` node. SingleNode. | +| ReadAllItems (single partition) | `PartitionKey` | Single `Request` node. Paginated. | +| ReadAllItems (cross-partition) | `FeedRange` (`all_ranges()`) | Resolve PK ranges → `SequentialDrain` over N `Request` nodes. Sequential. 
| ### 5.3 Pseudo-Code: Building a Trivial Plan @@ -620,8 +637,8 @@ operation or single-partition feed: ```rust // PSEUDO-CODE — illustrative, not compilable fn plan_trivial(operation: CosmosOperation, options: OperationOptions) -> OperationPlan { - OperationPlan::SingleNode(PlanNode::Fetch { - epk_range: operation.target().as_epk_range(), + OperationPlan::SingleNode(PlanNode::Request { + feed_range: operation.target().as_epk_range(), operation: Arc::new(operation), options, continuation: None, @@ -629,7 +646,7 @@ fn plan_trivial(operation: CosmosOperation, options: OperationOptions) -> Operat } ``` -No PK range resolution is needed. The operation is wrapped in a single `Fetch` node. +No PK range resolution is needed. The operation is wrapped in a single `Request` node. ### 5.4 Pseudo-Code: Building a ReadFeed Plan @@ -652,7 +669,7 @@ fn plan_read_feed( None => (EffectivePartitionKey::MIN, None), }; - // Build Fetch nodes bottom-up, one per PK range that hasn't been drained. + // Build Request nodes bottom-up, one per PK range that hasn't been drained. let shared_op = Arc::new(create_fetch_from(operation)); let mut nodes = Vec::new(); @@ -669,20 +686,20 @@ fn plan_read_feed( None }; - nodes.push(PlanNode::Fetch { + nodes.push(PlanNode::Request { operation: Arc::clone(&shared_op), - options: derive_fetch_options(range), - epk_range: range.epk_range(), + options: derive_request_options(range), + feed_range: range.feed_range(), continuation, }); } - // Push the Drain node after all its children (bottom-up invariant). + // Push the SequentialDrain node after all its children (bottom-up invariant). let children = NodeRange { start: NodeId(0), end: NodeId(nodes.len() as u32), }; - nodes.push(PlanNode::Drain { children }); + nodes.push(PlanNode::SequentialDrain { children }); let root = NodeId(nodes.len() as u32 - 1); OperationPlan::Graph { nodes, root } @@ -690,11 +707,11 @@ fn plan_read_feed( ``` Key points: -- Fetch nodes are pushed first (children), then the Drain (parent) — maintaining the +- Request nodes are pushed first (children), then the SequentialDrain (parent) — maintaining the bottom-up invariant. - On resume, ranges left of the continuation's EPK min are skipped entirely. The first - remaining Fetch carries the server token from the continuation. -- All Fetch nodes share the base operation via `Arc`, avoiding clones of headers and + remaining Request carries the server token from the continuation. +- All Request nodes share the base operation via `Arc`, avoiding clones of headers and resource references. ### 5.5 Resuming from a Continuation Token @@ -703,17 +720,17 @@ When a `ContinuationToken` is provided, the Planner validates it (version, conta operation kind), resolves the current partition key ranges, and uses the token's resume state to reconstruct the plan at the correct position. -The resume algorithm for `Drain` is described in [§7.3 Resume Strategy](#73-resume-strategy). +The resume algorithm for `SequentialDrain` is described in [§7.3 Resume Strategy](#73-resume-strategy). ### 5.6 Future Extensions The Planner architecture supports future operations without redesign: -- **ReadMany**: Group items by PK range, create concurrent `Fetch` nodes with an +- **ReadMany**: Group items by PK range, create concurrent `Request` nodes with an `UnorderedMerge` parent. Requires adding concurrency support to the PlanExecutor. 
-- **Cross-partition query**: Fetch a backend query plan, create `Fetch` nodes per +- **Cross-partition query**: Request a backend query plan, create `Request` nodes per partition, optionally with `OrderedMerge` for ORDER BY queries. -- **Change feed**: Create `Fetch` nodes scoped to feed ranges with change-feed-specific +- **Change feed**: Create `Request` nodes scoped to feed ranges with change-feed-specific continuation state. Add a parent merge node based on change-feed merge semantics. - **Concurrency management**: All plan nodes receive a **concurrency permit** (semaphore token) during execution. For ReadAll, the executor holds a single permit — sequential @@ -736,7 +753,7 @@ impl PlanExecutor { /// Executes one page of the plan, producing a `CosmosResponse`. /// /// The response includes a continuation token if more pages are available. - /// Each call executes exactly one HTTP request to one partition. + /// Each call executes exactly one Cosmos request to one partition. pub async fn execute( plan: &OperationPlan, driver_context: &DriverContext, @@ -747,7 +764,7 @@ impl PlanExecutor { } ``` -The following pseudo-code illustrates the core execution loop for a `Drain` plan. +The following pseudo-code illustrates the core execution loop for a `SequentialDrain` plan. Function names are descriptive; their implementations are not shown. ```rust @@ -758,9 +775,9 @@ async fn execute_plan( diagnostics: &mut DiagnosticsContextBuilder, ) -> Result { match plan { - OperationPlan::SingleNode(fetch) => { + OperationPlan::SingleNode(request) => { // Point ops and single-partition feeds: execute directly. - execute_fetch_node(fetch, driver_context, diagnostics).await + execute_request_node(request, driver_context, diagnostics).await } OperationPlan::Graph { nodes, root } => { let root_node = &nodes[root.0 as usize]; @@ -776,28 +793,28 @@ async fn execute_node( diagnostics: &mut DiagnosticsContextBuilder, ) -> Result { match node { - PlanNode::Fetch { .. } => { - execute_fetch_node(node, driver_context, diagnostics).await + PlanNode::Request { .. } => { + execute_request_node(node, driver_context, diagnostics).await } - PlanNode::Drain { children } => { - // Find the active child: the first Fetch that hasn't been drained. + PlanNode::SequentialDrain { children } => { + // Find the active child: the first Request that hasn't been drained. // On a fresh plan, this is children.start. On resume, the Planner // has already pruned drained partitions, so children.start is the // active one. let active_id = children.start; - let active_fetch = &all_nodes[active_id.0 as usize]; + let active_request = &all_nodes[active_id.0 as usize]; // Acquire a concurrency permit (sequential: only one permit). let _permit = acquire_concurrency_permit(driver_context).await; // Execute one page from the active partition. - let response = execute_fetch_node( - active_fetch, driver_context, diagnostics + let response = execute_request_node( + active_request, driver_context, diagnostics ).await?; // Build the continuation token based on what happened. let continuation = build_drain_continuation( - &response, active_fetch, active_id, children, all_nodes + &response, active_request, active_id, children, all_nodes ); Ok(response.with_continuation(continuation)) @@ -833,6 +850,13 @@ Continuation tokens must be: `serde`'s tagged enum deserialization handles unknown variants gracefully (they fail to parse, which is the correct behavior when an older SDK encounters a token from a newer one). 
+ **Version preservation across resume:** When resuming from an input continuation token, + the SDK MUST emit any output continuation token using the **same version** as the input + token. This guarantees that a caller persisting the token across pages does not observe + a version "drift" mid-operation: a token started at version N continues to round-trip as + version N until the operation completes, even if the SDK has since added support for a + higher version. The SDK only emits the latest version when no input token is provided. + 3. **Aim for O(1) size** — Token size should ideally be constant regardless of partition count. For ReadAll, only the state of the currently-active partition is stored, and other partitions' positions are reconstructed from EPK bounds on resume. However, per-partition @@ -898,13 +922,13 @@ enum ResumeState { /// On resume, ranges with max ≤ `epk_min` are skipped (already drained). /// The range matching `[epk_min, epk_max)` resumes from `server_token`. /// Ranges after `epk_max` start fresh. - #[serde(rename = "drain")] - Drain(DrainState), + #[serde(rename = "sequentialDrain")] + SequentialDrain(SequentialDrainState), - /// A single partition fetch, mid-stream or just completed. + /// A single partition request, mid-stream or just completed. /// Used as the root resume state for single-partition feed operations. - #[serde(rename = "fetch")] - Fetch(FetchState), + #[serde(rename = "request")] + Request(RequestState), // Future variants (added without changing token version): // @@ -917,10 +941,10 @@ enum ResumeState { // OrderedMerge(OrderedMergeState), } -/// Resume state for a Drain node. +/// Resume state for a SequentialDrain node. #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] -struct DrainState { +struct SequentialDrainState { /// EPK minimum of the current active feed range. /// All ranges with max ≤ this value have been fully drained. epk_min: String, @@ -935,10 +959,10 @@ struct DrainState { server_token: Option, } -/// Resume state for a single-partition Fetch node. +/// Resume state for a single-partition Request node. #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] -struct FetchState { +struct RequestState { /// EPK min inclusive of the target range. epk_min: String, @@ -959,11 +983,11 @@ struct FetchState { | | `container_rid` | `containerRid` | Container RID (string) | | | `operation_kind` | `operationKind` | Operation kind (e.g., `"readAll"`) | | | `resume` | `resume` | `ResumeState` (tagged union) | -| `DrainState` | *(tag)* | `type` | `"drain"` | +| `SequentialDrainState` | *(tag)* | `type` | `"sequentialDrain"` | | | `epk_min` | `epkMin` | EPK min inclusive (hex string) | | | `epk_max` | `epkMax` | EPK max exclusive (hex string) | | | `server_token` | `serverToken` | Server continuation (omitted if null) | -| `FetchState` | *(tag)* | `type` | `"fetch"` | +| `RequestState` | *(tag)* | `type` | `"request"` | | | `epk_min` | `epkMin` | EPK min inclusive (hex string) | | | `epk_max` | `epkMax` | EPK max exclusive (hex string) | | | `server_token` | `serverToken` | Server continuation (omitted if null) | @@ -973,9 +997,9 @@ struct FetchState { On resume, the Planner validates the token and uses the resume state to reconstruct the plan at the correct position. -#### `Drain` (sequential cross-partition) +#### `SequentialDrain` (sequential cross-partition) -The `DrainState` tracks the cursor position via EPK bounds. 
On resume: +The `SequentialDrainState` tracks the cursor position via EPK bounds. On resume: | Partition position | Action | |--------------------|--------| @@ -988,9 +1012,9 @@ to assign the server continuation to the appropriate child range. The `server_to to the first sub-range that overlaps the original EPK bounds; subsequent sub-ranges start fresh. -#### `Fetch` (leaf — single partition) +#### `Request` (leaf — single partition) -A bare `FetchState` at the root (no wrapping `Drain`) represents a single-partition operation. +A bare `RequestState` at the root (no wrapping `SequentialDrain`) represents a single-partition operation. Resume uses `server_token` directly. ### 7.4 Serialization @@ -1039,7 +1063,7 @@ JSON (before base64 encoding): "containerRid": "dbs/abc/colls/def", "operationKind": "readAll", "resume": { - "type": "drain", + "type": "sequentialDrain", "epkMin": "55", "epkMax": "AA", "serverToken": "+RID:~abc123#RT:1#TRC:10#ISV:2#IEO:65551" @@ -1058,7 +1082,7 @@ skipped. The range `["55","AA")` resumes from `serverToken`. Ranges after `"AA"` "containerRid": "dbs/abc/colls/def", "operationKind": "readAll", "resume": { - "type": "drain", + "type": "sequentialDrain", "epkMin": "55", "epkMax": "AA" } @@ -1070,7 +1094,7 @@ skips everything up to and including this range, and starts the next partition f **Single-partition feed, mid-stream** -A bare `FetchState` at the root (no wrapping layer): +A bare `RequestState` at the root (no wrapping layer): ```json { @@ -1078,7 +1102,7 @@ A bare `FetchState` at the root (no wrapping layer): "containerRid": "dbs/abc/colls/def", "operationKind": "readAll", "resume": { - "type": "fetch", + "type": "request", "epkMin": "55", "epkMax": "AA", "serverToken": "-RID:QmFzZTY0#RT:3#TRC:50" @@ -1101,8 +1125,11 @@ A continuation token is **invalidated** by: A continuation token **survives**: -1. **Partition splits** — The token stores EPK bounds, not PK range IDs. On resume, the - Planner re-resolves EPK bounds to current PK range IDs. +1. **Partition splits and merges** — The token stores EPK bounds, not PK range IDs. On resume, + the Planner re-resolves EPK bounds to current PK range IDs. After a split, an original + range maps to multiple child ranges; after a merge, multiple original ranges map to a + single combined range. Either way, the EPK bounds in the token still identify the exact + slice of the EPK space that has (or hasn't) been drained. 2. **SDK version upgrades** — The token is versioned. Older token versions are supported by newer SDKs (backward compatible deserialization). 3. **Process boundaries** — The token is a self-contained string, safe to send to a browser @@ -1112,7 +1139,7 @@ A continuation token **survives**: ### 7.6 What the Token Does NOT Encode -- **Per-range state for all partitions (for Drain)** — Only the active range's state is +- **Per-range state for all partitions (for SequentialDrain)** — Only the active range's state is stored. Other partitions' positions are reconstructed from the EPK bounds on resume. Other node types may store per-range state if needed (see §12.3 Change Feed). - **Query text or parameters** — The caller must provide an equivalent `CosmosOperation`. @@ -1139,20 +1166,20 @@ enough detail for the SDK to reconstruct the full execution timeline. ### 8.2 Hierarchy: Plan → Node → Request Each `execute_operation` call produces a `DiagnosticsContext` with a hierarchical view of the -operation plan's execution. 
The hierarchy mirrors the plan graph: composite nodes (Drain) -contain child node diagnostics, and leaf nodes (Fetch) contain HTTP request diagnostics. +operation plan's execution. The hierarchy mirrors the plan graph: composite nodes (SequentialDrain) +contain child node diagnostics, and leaf nodes (Request) contain Cosmos request diagnostics. ```text DiagnosticsContext ├── activityId, totalDurationMs, totalRequestCharge │ └── operationPlan (NodeDiagnostics) - ├── nodeType: "drain" + ├── nodeType: "sequentialDrain" ├── startedAt, completedAt, durationMs │ └── children[] └── [0] NodeDiagnostics - ├── nodeType: "fetch" + ├── nodeType: "request" ├── epkRange: { min, max } ├── startedAt, completedAt, durationMs ├── requestCharge @@ -1162,15 +1189,15 @@ DiagnosticsContext │ ├── [0] RequestDiagnostics (initial attempt) │ └── [1] RequestDiagnostics (retry, if any) │ - └── children[] (empty for Fetch) + └── children[] (empty for Request) ``` Every node holds a list of diagnostics from the child nodes it triggered (`children`), -as well as its own HTTP requests. This makes the diagnostics structure recursive and +as well as its own Cosmos requests. This makes the diagnostics structure recursive and directly mirrors the plan graph. For point operations (SingleNode plan), the hierarchy collapses: the `operationPlan` -is a single Fetch node with its requests and no children. The existing flat `requests()` +is a single Request node with its requests and no children. The existing flat `requests()` accessor is preserved for backward compatibility by flattening the tree. ### 8.3 Hierarchical Diagnostics Types @@ -1178,15 +1205,15 @@ accessor is preserved for backward compatibility by flattening the tree. ```rust /// Diagnostics for a single plan node's execution. /// -/// This type is recursive: composite nodes (Drain) contain child +/// This type is recursive: composite nodes (SequentialDrain) contain child /// `NodeDiagnostics` entries, mirroring the plan graph structure. pub struct NodeDiagnostics { /// What kind of node this was. node_type: NodeType, - /// The EPK range targeted by this node (for Fetch nodes). - /// `None` for non-fetch nodes. - epk_range: Option>, + /// The EPK range targeted by this node (for Request nodes). + /// `None` for non-Request nodes. + feed_range: Option, /// When the node started executing. started_at: Instant, @@ -1200,15 +1227,15 @@ pub struct NodeDiagnostics { /// Total RU charge for this node (including children). request_charge: RequestCharge, - /// Individual HTTP request diagnostics for this node. - /// Empty for non-leaf nodes that don't directly issue HTTP requests. + /// Individual Cosmos request diagnostics for this node. + /// Empty for non-leaf nodes that don't directly issue Cosmos requests. /// May contain multiple entries due to retries within the node. requests: Vec, - /// Child node diagnostics, for composite nodes (Drain, future merge nodes). - /// Empty for leaf nodes (Fetch). - /// For Drain, contains only the nodes that were executed in this call - /// (typically one Fetch node per page). + /// Child node diagnostics, for composite nodes (SequentialDrain, future merge nodes). + /// Empty for leaf nodes (Request). + /// For SequentialDrain, contains only the nodes that were executed in this call + /// (typically one Request node per page). children: Vec, /// Outcome of this node's execution. 
@@ -1228,10 +1255,10 @@ pub enum NodeOutcome {
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
 #[serde(rename_all = "camelCase")]
 pub enum NodeType {
-    /// A Fetch node that executed an HTTP request via execute_single_operation.
-    Fetch,
-    /// A Drain node that sequentially processes partitions.
-    Drain,
+    /// A Request node that executed a Cosmos request via execute_single_operation.
+    Request,
+    /// A SequentialDrain node that sequentially processes partitions.
+    SequentialDrain,
     // Future: UnorderedMerge, OrderedMerge, Aggregate, etc.
 }
 ```
@@ -1248,7 +1275,7 @@ names:
   "totalRequestCharge": 5.23,
   "requestCount": 1,
   "operationPlan": {
-    "nodeType": "drain",
+    "nodeType": "sequentialDrain",
     "startedAt": 0,
     "completedAt": 42,
     "durationMs": 42,
@@ -1257,7 +1284,7 @@ names:
     "requests": [],
     "children": [
       {
-        "nodeType": "fetch",
+        "nodeType": "request",
         "epkRange": { "min": "00", "max": "55" },
         "startedAt": 0,
         "completedAt": 15,
@@ -1295,8 +1322,8 @@ names:
 }
 ```
 
-For point operations, the structure is similar but with a single Fetch node and no wrapping
-Drain:
+For point operations, the structure is similar but with a single Request node and no wrapping
+SequentialDrain:
 
 ```json
 {
@@ -1305,7 +1332,7 @@ Drain:
   "totalRequestCharge": 1.0,
   "requestCount": 1,
   "operationPlan": {
-    "nodeType": "fetch",
+    "nodeType": "request",
     "epkRange": null,
     "durationMs": 8,
     "requestCharge": 1.0,
@@ -1327,7 +1354,7 @@ impl DiagnosticsContext {
     /// Returns the plan diagnostics for this operation.
     pub fn operation_plan(&self) -> &NodeDiagnostics { ... }
 
-    /// Returns all HTTP request diagnostics, flattened across nodes.
+    /// Returns all Cosmos request diagnostics, flattened across nodes.
     ///
     /// This is backward-compatible with the pre-feed-operations API.
     /// Requests are returned in the order they were executed.
@@ -1345,7 +1372,7 @@ impl DiagnosticsContextBuilder {
     pub(crate) fn start_node(
         &mut self,
         node_type: NodeType,
-        epk_range: Option>,
+        feed_range: Option<FeedRange>,
     ) -> NodeHandle { ... }
 
     /// Records that a node has completed, with its requests and children.
@@ -1386,34 +1413,60 @@ pagination and can:
 
 ---
 
-## 9. Error Handling & Partition Splits
+## 9. Error Handling & Partition Splits & Merges
 
 ### 9.1 Partition Split During Execution
 
-Fetch nodes target **EPK ranges**, not PK range IDs. When a Fetch node receives a 410/1002
-(Gone — PartitionKeyRangeGone) response, the Fetch node handles the split **internally**:
+Request nodes target **EPK ranges**, not PK range IDs. When a Request node receives a 410/1002
+(Gone — PartitionKeyRangeGone) response, the Request node handles the split **internally**:
 
 1. **Invalidate** the `PartitionKeyRangeCache` for the affected container.
 2. **Re-fetch** the partition key ranges.
-3. **Re-resolve** the Fetch node's EPK range to the new child PK range IDs.
-4. **Internally split** — the single Fetch range issues requests to the appropriate
+3. **Re-resolve** the Request node's EPK range to the new child PK range IDs.
+4. **Internally split** — the single Request range issues requests to the appropriate
    child PK ranges.
 5. **Resume execution** with the child range result.
 
-The plan structure remains stable across splits. The Fetch node absorbs the split
+The plan structure remains stable across splits. The Request node absorbs the split
 internally without changing the plan graph. 
The next time the plan is generated (on the next page), the Planner will see the new split ranges from the PK range cache and create -separate Fetch nodes for each child range — the continuation token's EPK bounds guide +separate Request nodes for each child range — the continuation token's EPK bounds guide the resume position correctly. The continuation token survives because it stores EPK bounds (not PK range IDs), and the Planner re-resolves those bounds to current PK range IDs on each page. -### 9.2 Error Propagation +### 9.2 Partition Merge During Execution + +Cosmos DB may also **merge** adjacent partitions to consolidate underutilized capacity. +After a merge, multiple original PK ranges become one larger PK range. The Request node's +EPK bounds may now fall entirely **inside** a larger merged PK range — the EPK range did +not change, but its owning PK range did. + +Merge handling: + +1. **Cache miss / 410** — A Request may detect the merge via either a stale-cache PK range + ID (the old PK range no longer exists) or via a 410/1002 response. The handling mirrors + the split path: invalidate the cache, re-fetch ranges, re-resolve EPK bounds. +2. **EPK bounds preserved on the wire** — When the Request issues requests against the + merged PK range, it MUST include `x-ms-documentdb-epk-min` and `x-ms-documentdb-epk-max` + headers set to its original EPK bounds. This ensures the server returns only items + inside the Request's intended slice of the merged range, not the entire merged range. +3. **Continuation token survival** — The continuation token's EPK bounds remain valid. + On the next page, the Planner sees the merged PK range and may produce a single + Request node spanning what was previously multiple ranges. The token's EPK bounds + correctly identify the cursor position inside the merged range. + +The plan structure changes across pages (fewer Request nodes after a merge), but the +continuation token's semantics are unchanged: it identifies a slice of the EPK space +that has been drained, regardless of how that slice maps to PK ranges. + +### 9.3 Error Propagation | Error Scenario | Behavior | |----------------|----------| -| 410/1002 (PartitionKeyRangeGone) | Fetch node internally re-resolves EPK range, retries. | +| 410/1002 (PartitionKeyRangeGone) — split | Request node internally re-resolves EPK range, retries against child PK ranges. | +| 410/1002 (PartitionKeyRangeGone) — merge | Request node internally re-resolves EPK range, retries against the merged PK range with EPK min/max headers. | | 429 (Throttled) | Handled by transport pipeline (backoff + retry). | | 503 (Service Unavailable) | Handled by operation pipeline (region failover). | | 404 (Not Found) — container | Fail the entire feed operation. | @@ -1454,13 +1507,39 @@ impl CosmosDriver { ### 10.2 CosmosResponse Changes -`CosmosResponse` gains an optional continuation token: +`CosmosResponse` gains an optional continuation token, and its `body` field becomes a +`ResponseBody` enum to support both single-document responses (point operations) and +multi-document responses (feed operations) without forcing every caller to parse a feed +envelope: ```rust +/// The body of a Cosmos DB response. +/// +/// Mirrors `OperationPayload` on the request side: each variant carries +/// exactly the data shape expected for its kind of operation, and the +/// driver does not deserialize item content. +#[non_exhaustive] +pub enum ResponseBody { + /// No body (e.g., 204 No Content). 
+    None,
+
+    /// A single document body — raw serialized bytes.
+    /// Used for point operations (read, create, upsert, replace) and for
+    /// resource reads (database, container).
+    Single(Vec<u8>),
+
+    /// A list of document bodies — one entry per item, each entry being
+    /// the raw serialized bytes of one item.
+    /// Used for feed operations (ReadAll, future query/read-many).
+    /// The driver parses the response envelope to split items into a
+    /// `Vec<Vec<u8>>` but does not deserialize the items themselves.
+    Items(Vec<Vec<u8>>),
+}
+
 #[non_exhaustive]
 pub struct CosmosResponse {
-    /// Raw response body (UTF-8 JSON or Cosmos binary encoding).
-    body: Vec<u8>,
+    /// Response body. Variant depends on operation type.
+    body: ResponseBody,
 
     /// Extracted Cosmos-specific headers.
     headers: CosmosResponseHeaders,
@@ -1478,6 +1557,11 @@ pub struct CosmosResponse {
 }
 
 impl CosmosResponse {
+    /// Returns the response body.
+    pub fn body(&self) -> &ResponseBody {
+        &self.body
+    }
+
     /// Returns the continuation token, if more pages are available.
     ///
     /// For point operations, this always returns `None`.
@@ -1488,6 +1572,10 @@ impl CosmosResponse {
 }
 ```
 
+The `Items(Vec<Vec<u8>>)` shape lets the SDK iterate items and apply per-item
+deserialization (with per-item error handling) without first parsing the entire
+feed envelope itself.
+
 ### 10.3 OperationOptions Changes
 
 `OperationOptions` gains feed-specific fields:
 
```rust
pub struct OperationOptions {
    // ... existing fields (retry, timeout, consistency, etc.) ...
 
    /// Maximum number of items per page (feed operations only).
+    ///
+    /// **This is always a hint.** The driver and the server may exceed it in
+    /// well-defined cases:
+    ///
+    /// - The server may return fewer items than requested (e.g., a partition
+    ///   has fewer items than `max_item_count`).
+    /// - Some operations require returning a logical group of items together,
+    ///   even if that group exceeds `max_item_count`. The most prominent case
+    ///   is the change feed, where all documents sharing the same LSN
+    ///   (logical sequence number) are returned in the same page to preserve
+    ///   atomicity. ReadAll does not have this constraint today, but the
+    ///   contract is the same: callers MUST treat `max_item_count` as a hint,
+    ///   not a hard cap.
+    ///
    /// If not set, the server default applies.
    max_item_count: Option<u32>,
 
@@ -1521,6 +1623,9 @@ Each `execute_operation` call for ReadAll returns exactly one page from exactly
 
 - **Server-side max item count**: The server may return fewer items than requested.
 - **Client-side max item count**: Configurable via `OperationOptions::max_item_count`.
+  This is **always a hint** — the driver may exceed it when an operation requires
+  returning a logical group of items together (e.g., change feed returns all documents
+  sharing the same LSN in the same page). Callers MUST NOT treat the value as a hard cap.
 - **Server continuation**: A page boundary occurs whenever the server returns a
   continuation token.
 - **Partition boundary**: When a partition is fully drained (no server continuation), the
@@ -1537,11 +1642,11 @@ Pages never span partition boundaries.
 
 | Test Area | Cases |
 |-----------|-------|
 | Planner — point ops | Verify SingleNode plan for each point operation type. |
-| Planner — ReadAll | Verify Graph plan with Drain root, correct Fetch children per PK range. |
+| Planner — ReadAll | Verify Graph plan with SequentialDrain root, correct Request children per PK range. 
| | Planner — ReadAll resume | Verify resume skips drained partitions, resumes active, starts right fresh. | | Planner — bottom-up invariant | Verify children always have lower NodeIds than parents. | | PlanExecutor — single node | Execute SingleNode plan, verify result matches direct pipeline call. | -| PlanExecutor — drain | Execute Drain plan with mock pipeline, verify sequential execution. | +| PlanExecutor — drain | Execute SequentialDrain plan with mock pipeline, verify sequential execution. | | PlanExecutor — drain page boundary | Verify pages don't span partition boundaries. | | ContinuationToken — serialize | Serialize to base64url string, verify roundtrip. | | ContinuationToken — deserialize | Deserialize from explicit string, verify result. | @@ -1549,11 +1654,11 @@ Pages never span partition boundaries. | ContinuationToken — future version | Token with version > current is rejected. | | ContinuationToken — operation kind | Token with wrong operation kind is rejected. | | ContinuationToken — split recovery | Token with EPK bounds spanning a split range maps to correct child ranges. | -| ContinuationToken — Drain resume | Drain node correctly classifies partitions as left/target/right. | +| ContinuationToken — SequentialDrain resume | SequentialDrain node correctly classifies partitions as left/target/right. | | ContinuationToken — nesting | Nested tokens round-trip correctly through serialize/deserialize. | | ContinuationToken — unknown variant | Unknown `ResumeState` type fails gracefully on deserialize. | | NodeId/NodeRange | Verify range iteration, length, empty checks. | -| OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `EpkRange` produce correct targets. | +| OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `FeedRange` produce correct targets. | | Diagnostics — hierarchy | Verify recursive node tree structure appears in diagnostics JSON. | | Diagnostics — children | Verify composite nodes contain child node diagnostics. | | Diagnostics — backward compat | Verify `requests()` flattening returns all requests from nested nodes. | @@ -1569,10 +1674,10 @@ Pages never span partition boundaries. | ReadAll — pagination | Verify continuation token threads correctly across pages. | | ReadAll — resume | Get continuation mid-stream, resume from it, verify continued results. | | ReadAll — resume across SDK versions | Serialize token, deserialize with newer SDK, verify resume works. | -| ReadAll — partition split | Trigger split during ReadAll, verify Fetch node re-resolves and completes. | +| ReadAll — partition split | Trigger split during ReadAll, verify Request node re-resolves and completes. | | ReadAll — large dataset | Read many items, verify all pages and partitions are drained. | | Diagnostics — RU aggregation | Verify total RU charge sums across all pages. | -| Diagnostics — plan structure | Verify diagnostics JSON shows Drain/Fetch hierarchy with children. | +| Diagnostics — plan structure | Verify diagnostics JSON shows SequentialDrain/Request hierarchy with children. | ### 11.3 Performance Tests @@ -1588,13 +1693,13 @@ Pages never span partition boundaries. ### 12.1 ReadMany ReadMany reads multiple items by (ID, PartitionKey) pairs. It requires grouping items by -PK range, creating concurrent `Fetch` nodes, and merging results via an `UnorderedMerge` +PK range, creating concurrent `Request` nodes, and merging results via an `UnorderedMerge` node. 
This adds concurrency control (semaphore-based) to the PlanExecutor and a new `PlanNode::UnorderedMerge` variant. ### 12.2 Cross-Partition Queries -Cross-partition queries require fetching a backend query plan, creating `Fetch` nodes per +Cross-partition queries require fetching a backend query plan, creating `Request` nodes per partition, and optionally performing client-side sort for ORDER BY queries via an `OrderedMerge` node. This adds query plan fetching callbacks to the Planner and k-way merge logic to the PlanExecutor. From 36ce91b25dc4dbbb0bbd4ddf70dcb633d2164810 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Fri, 1 May 2026 19:39:26 +0000 Subject: [PATCH 10/29] rename ResponseBody::Single to ResponseBody::Bytes --- .../docs/FEED_OPERATIONS_SPEC.md | 591 +++++++++++++++--- 1 file changed, 498 insertions(+), 93 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 6b33cdf331e..753b63b80c9 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -1,7 +1,7 @@ # Feed Operations Spec for `azure_data_cosmos_driver` **Status:** Draft / Iterating -**Date:** 2026-04-28 +**Date:** 2026-05-05 **Authors:** (team) **Crate:** `azure_data_cosmos_driver` @@ -33,7 +33,7 @@ and produce a single response. Operations like `ReadItem`, `UpsertItem`, and `De through `execute_operation`, which drives the operation pipeline (region failover, session tokens, transport retry) and returns a single `CosmosResponse`. -**Feed operations** — read-all-items, queries, read-many, and change feed — are fundamentally +**Feed operations** — queries, read-many, and change feed — are fundamentally different. They produce multiple pages of results, may span multiple partition key ranges, and need pagination state that can be serialized across request boundaries. @@ -53,9 +53,9 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i Resuming with a valid continuation token and an equivalent operation descriptor continues where the previous execution left off. -3. **Extensible operation model** — The plan model must support ReadAll (the initial target), - cross-partition queries, single-partition queries/reads, read-many, and change feed, even if - some are implemented later. +3. **Extensible operation model** — The plan model must support `SELECT * [WHERE …]` queries + (the initial target), cross-partition queries with `ORDER BY` / aggregates, single-partition + queries/reads, read-many, and change feed, even if some are implemented later. 4. **Driver-level concerns** — Feed operations must integrate with multi-region failover, partition-level failover (PPAF/PPCB), throughput control, session consistency, and @@ -75,9 +75,12 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i ### Non-Goals (This Spec) -- Full cross-partition query execution with ORDER BY merge-sort and aggregation (future work). +- Cross-partition query execution with cross-partition `ORDER BY` merge-sort, `GROUP BY`, or + cross-partition aggregation (future work). Single-partition queries with `ORDER BY` / `GROUP BY` + / aggregates *are* in scope and pass through verbatim (see [§5.2](#52-planning-logic-by-operation-type)). - Backend query plan retrieval and interpretation (future work; required for cross-partition - queries but not for ReadAll). 
+ queries with `ORDER BY` / aggregates and for vector / hybrid queries, but not for the + in-scope cases). - Change feed full design (future work; this spec reserves extension points). - ReadMany fan-out with concurrent partition fetching (future work). - Client-side query rewriting or optimization. @@ -85,11 +88,20 @@ failover, partition-level circuit breaker, throughput control, and diagnostics i ### Primary Target -**ReadAll** is the first feed operation to implement. It reads all documents from a container by -draining partitions sequentially in effective partition key (EPK) order. Items are returned in -their **natural order**: ascending by `(EffectivePartitionKey, RID)`. Within each partition the -server returns items in ascending RID order; across partitions the driver iterates partitions -in ascending EPK order. ReadAll exercises: +**`SELECT * [WHERE ]` queries** are the first feed operations to implement. The +unfiltered form (`SELECT * FROM c`) is the simplest case; with an optional `WHERE` clause +the same code path supports server-side filtering. Both forms drain +partitions sequentially in effective partition key (EPK) order. Items are returned in their +**natural order**: ascending by `(EffectivePartitionKey, RID)`. Within each partition the server +returns items in ascending RID order; across partitions the driver iterates partitions in +ascending EPK order. + +This first target deliberately excludes any query feature that requires a backend query plan +to execute correctly across partitions — `ORDER BY` (cross-partition), `GROUP BY`, `DISTINCT`, +aggregates (`COUNT`, `SUM`, …), `OFFSET / LIMIT`, vector search, hybrid search, etc. Those are +covered separately under cross-partition queries in [§12.2](#122-cross-partition-queries). + +The in-scope shape exercises: - Partition key range resolution (via `PartitionKeyRangeCache`) - Sequential traversal across partition key ranges in EPK order @@ -97,16 +109,23 @@ in ascending EPK order. ReadAll exercises: - Paginated reads within each partition - Continuation token serialization and resume across SDK versions - Integration with the operation pipeline for each sub-request - -This spec is **complete when ReadAll works end-to-end** through the Plan → Execute pipeline. -Sections on continuation tokens and the plan model are designed to be extensible for future -operations (ReadMany, cross-partition query, change feed) without requiring a redesign. - -**Ordering semantics:** ReadAll drains partitions in EPK order as an implementation behavior. -Within each partition, items are returned in ascending RID order — the natural sort order of -`SELECT *`. The combined output is therefore ascending by `(EffectivePartitionKey, RID)`. This -is a driver-emitted ordering, **not** a service-level ordering guarantee. The service does not -guarantee global cross-partition order without explicit `ORDER BY`. +- Per-fetch header overrides (EPK bounds, server continuation, page size) applied without + rebuilding the base `CosmosOperation` — see [§6.3 OperationOverrides](#63-operationoverrides) + +This spec is **complete when `SELECT * [WHERE …]` works end-to-end** through the Plan → Execute +pipeline, both as a cross-partition operation (`OperationTarget::FeedRange`) and as a +single-partition operation (`OperationTarget::PartitionKey`). Sections on continuation tokens +and the plan model are designed to be extensible for future operations (ReadMany, +cross-partition query, change feed) without requiring a redesign. 
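+
+To make the in-scope shape concrete, here is a minimal sketch of the intended call pattern,
+written against the driver surface this spec defines (`CosmosOperation::query`, §3.3;
+`execute_operation` and the continuation round-trip, §10.1). It follows the spec's own
+pseudo-code convention; the `handle` helper and the `parameters` value are placeholders,
+not proposed API:
+
+```rust
+// PSEUDO-CODE — illustrative, not compilable
+let op = CosmosOperation::query(
+    container,
+    "SELECT * FROM c WHERE c.category = @cat",
+    parameters, // parameter list shape is defined with the payload in §3.1
+);
+// Default target: OperationTarget::all_ranges(), i.e. the whole container.
+let mut options = OperationOptions::default();
+loop {
+    let page = driver.execute_operation(&op, &options).await?;
+    handle(page.body()); // one page, from exactly one partition (§10.5)
+    match page.continuation_token() {
+        Some(token) => options.continuation = Some(token.clone()), // fetch next page
+        None => break, // all partitions drained
+    }
+}
+```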
+ +**Ordering semantics:** Cross-partition `SELECT *` drains partitions in EPK order as an +implementation behavior. Within each partition, items are returned in ascending RID order — +the natural sort order of `SELECT *`. The combined output is therefore ascending by +`(EffectivePartitionKey, RID)`. This is a driver-emitted ordering, **not** a service-level +ordering guarantee. The service does not guarantee global cross-partition order without +explicit `ORDER BY`. Single-partition queries (targeted via +`OperationTarget::PartitionKey`) preserve whatever order the server returns, including +`ORDER BY` results. --- @@ -126,7 +145,7 @@ flowchart TB subgraph Driver["CosmosDriver"] direction TB - Planner["Planner
──────────
Input: CosmosOperation + OperationOptions
Output: OperationPlan

• Determines targeting (point PK, FeedRange, full key space)
• ReadAll: resolves PK ranges → SequentialDrain over Request nodes
• Single-partition ops: single-node plan
• Point ops: trivial single-node plan"] + Planner["Planner
──────────
Input: CosmosOperation + OperationOptions
Output: OperationPlan

• Determines targeting (point PK, FeedRange, full key space)
• Cross-partition SELECT *: resolves PK ranges → SequentialDrain over Request nodes
• Single-partition ops: single-node plan
• Point ops: trivial single-node plan"] Executor["PlanExecutor
──────────
Input: OperationPlan
Output: CosmosResponse (single page)

• Executes one Request node per call
• Handles partition splits / merges (Request re-resolves EPK → PK)
• Collects node-level diagnostics
• Builds continuation token if more pages remain"] @@ -212,15 +231,22 @@ would break `Copy` and mix operation semantics with operation payload — we spl /// Each variant carries exactly the data needed for its operation type. #[derive(Clone, Debug)] pub enum OperationPayload { - /// No payload needed (e.g., ReadItem, DeleteItem, ReadContainer, ReadAllItems). + /// No payload needed (e.g., ReadItem, DeleteItem, ReadContainer). None, /// Raw body bytes (e.g., CreateItem, UpsertItem, ReplaceItem). /// The caller provides pre-serialized JSON. Body(Vec), + /// A SQL query against documents (`SELECT * [WHERE …]`, etc.). + /// The driver wraps this in the `application/query+json` envelope on + /// the wire; the caller does not pre-serialize it. + Query { + query: String, + parameters: Vec, + }, + // Future variants: - // Query { query: String, parameters: Option> }, // ReadMany { items: Vec<(String, PartitionKey)> }, // ChangeFeed { mode, start_from, ... }, } @@ -255,12 +281,25 @@ pub struct CosmosOperation { ### 3.2 OperationTarget Partition targeting is currently a single `Option` field. Feed operations require -richer targeting. The targeting enum has three modes: no partition scope, a specific logical -partition key (needed for point reads where the raw partition key value must be sent to the -backend), or an EPK range for feed operations spanning one or more partitions. +richer targeting. The targeting enum has three **mutually exclusive** modes: no partition scope, +a specific logical partition key, or an EPK range. An operation chooses exactly one of these — +it never combines a logical partition key with a feed range. + +| Variant | When the SDK picks it | What the driver does | +|---------|----------------------|----------------------| +| `None` | Account-/database-level operations (`CreateDatabase`, `ReadContainer`). | No PK routing; no EPK headers. | +| `PartitionKey(pk)` | Point operations and any single-partition feed operation (queries, change feed scoped to one logical partition). | Sends the raw PK header (`x-ms-documentdb-partitionkey`); routes to that PK's owning physical partition. **Bypasses query plan** for SQL queries — see [§5.2](#52-planning-logic-by-operation-type). | +| `FeedRange(fr)` | Cross-partition feed operations, including the default "whole container" case via `OperationTarget::all_ranges()`. | Resolves the FeedRange to one or more PK range IDs via `PartitionKeyRangeCache`; sets `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max` headers per fetch. | + +Logical partition key targeting and feed range targeting are mutually exclusive at the type +level. A caller that wants to target a single logical partition uses +`OperationTarget::PartitionKey`. A caller that wants to target a slice of EPK space (one +physical partition, several adjacent ones, or the whole container) uses +`OperationTarget::FeedRange`. The SDK surface enforces the same exclusivity (see +[§10.6 SDK Option Plumbing](#106-sdk-option-plumbing)). ```rust -/// How the operation is targeted to partitions. +/// How the operation is targeted to partitions. Variants are mutually exclusive. #[derive(Clone, Debug)] pub enum OperationTarget { /// No partition targeting (account-level or database-level operations, @@ -269,17 +308,20 @@ pub enum OperationTarget { /// Target a specific logical partition key. /// - /// Used for point operations (read, create, delete, upsert, replace) - /// and single-partition feed operations where the raw partition key - /// value must be included in the request headers. 
+ /// Used for point operations (read, create, delete, upsert, replace) and + /// for single-partition feed operations (queries scoped to one logical + /// partition, single-partition change feed, etc.). The raw partition key + /// value is included in request headers and the request goes straight to + /// the gateway for the owning physical partition. No FeedRange / EPK + /// header is set. PartitionKey(PartitionKey), /// Target a specific feed range. /// - /// Used for feed operations that span one or more partitions. - /// Uses the `FeedRange` type, which represents a contiguous span - /// of effective partition key (EPK) space. See §3.2.1 below for - /// the type's origin. + /// Used for feed operations that span one or more partitions. Uses the + /// `FeedRange` type, which represents a contiguous span of effective + /// partition key (EPK) space. See §3.2.1 below for the type's origin. + /// Use `OperationTarget::all_ranges()` for the whole container. /// /// The pipeline resolves the FeedRange to the owning PK range ID(s) via /// the `PartitionKeyRangeCache` at execution time. @@ -347,13 +389,28 @@ impl CosmosOperation { // Caller attaches body via .with_payload(OperationPayload::Body(...)) } - /// Reads all items across all partitions. - pub fn read_all_items(container: ContainerReference) -> Self { + /// Runs a SQL query (`SELECT * [WHERE …]`, etc.). + /// + /// Without an explicit `with_target(...)`, the query targets the entire + /// container (`OperationTarget::all_ranges()`). To scope the query to a + /// single logical partition (which unlocks `ORDER BY` and other clauses + /// without a query plan — see §5.2), call + /// `.with_target(OperationTarget::PartitionKey(pk))`. To scope it to a + /// specific FeedRange, call `.with_target(OperationTarget::FeedRange(fr))`. + pub fn query( + container: ContainerReference, + query: impl Into, + parameters: Vec, + ) -> Self { let resource_ref = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); - Self::new(OperationType::ReadFeed, resource_ref) + Self::new(OperationType::Query, resource_ref) .with_target(OperationTarget::all_ranges()) + .with_payload(OperationPayload::Query { + query: query.into(), + parameters, + }) } } ``` @@ -437,7 +494,7 @@ pub(crate) enum OperationPlan { /// A multi-node plan stored as a flat list of nodes. /// Nodes are stored bottom-up: children appear before parents. - /// Used for cross-partition feed operations (e.g., ReadAll). + /// Used for cross-partition feed operations (e.g., a cross-partition `SELECT *` query). Graph { /// The flat list of nodes. Children appear before parents. nodes: Vec, @@ -507,7 +564,7 @@ The flat node list is always built **bottom-up**: leaf nodes (Request) are pushe then their parent (SequentialDrain) is pushed after them. This produces a deterministic layout where `NodeId` values are stable for a given set of inputs. -For a ReadAll plan over 3 partitions, the node list looks like: +For a cross-partition `SELECT *` plan over 3 partitions, the node list looks like: ```text Index Node @@ -534,7 +591,7 @@ SingleNode(Request { operation: read_item, feed_range: pk_epk, continuation: Non A `SingleNode` plan with one `Request` node. The executor runs it directly, gets a `CosmosResponse`, done. No heap allocation. -#### ReadAll (Cross-Partition) +#### Cross-Partition `SELECT *` Query ```text Graph { @@ -558,7 +615,7 @@ partition. When a partition is fully drained (server returns no continuation), t call starts the next partition. 
A continuation token is returned after each page until all partitions are exhausted. -#### ReadAll — Resumed from Continuation +#### Cross-Partition `SELECT *` — Resumed from Continuation When resuming from a continuation token that says "active range is `["55","AA")` with server token `xyz`", the Planner skips already-drained ranges and rebuilds the plan @@ -593,8 +650,9 @@ direct `execute_single_operation` call. The `OperationPlan::SingleNode` variant ### 5.1 Responsibilities -The Planner transforms a `CosmosOperation` into an `OperationPlan`. For ReadAll, this is -synchronous: resolve partition key ranges and build a `SequentialDrain` node over `Request` children. +The Planner transforms a `CosmosOperation` into an `OperationPlan`. For a cross-partition +`SELECT *` query, this is synchronous: resolve partition key ranges and build a +`SequentialDrain` node over `Request` children. ```rust pub(crate) struct Planner<'a> { @@ -606,7 +664,7 @@ impl<'a> Planner<'a> { /// Creates an operation plan from a CosmosOperation. /// /// For point operations, this is synchronous and trivial. - /// For ReadAll, this resolves PK ranges and builds a SequentialDrain plan. + /// For cross-partition `SELECT *`, this resolves PK ranges and builds a SequentialDrain plan. pub async fn plan( &self, operation: &CosmosOperation, @@ -626,8 +684,42 @@ impl<'a> Planner<'a> { |-----------|-----------|---------------| | ReadItem, DeleteItem, etc. | `PartitionKey` | Single `Request` node. SingleNode. | | CreateDatabase, ReadContainer, etc. | `None` | Single `Request` node. SingleNode. | -| ReadAllItems (single partition) | `PartitionKey` | Single `Request` node. Paginated. | -| ReadAllItems (cross-partition) | `FeedRange` (`all_ranges()`) | Resolve PK ranges → `SequentialDrain` over N `Request` nodes. Sequential. | +| `SELECT * [WHERE …]`, single partition | `PartitionKey` | Single `Request` node. Paginated. **Fast-path: no query plan fetch.** See [§5.2.1](#521-single-partition-query-fast-path). | +| Single-partition query with `ORDER BY` / `GROUP BY` / aggregates / `OFFSET LIMIT` / etc. | `PartitionKey` | Same as above — pass through verbatim. | +| `SELECT * [WHERE …]`, cross-partition | `FeedRange` (`all_ranges()` or a caller-supplied range) | Resolve PK ranges → `SequentialDrain` over N `Request` nodes. Sequential. No query plan needed for `SELECT * [WHERE …]`. | +| Cross-partition query with `ORDER BY` / aggregates / vector / hybrid | `FeedRange` | **Out of scope for this spec.** Requires backend query plan retrieval — see [§12.2](#122-cross-partition-queries). | + +#### 5.2.1 Single-Partition Query Fast-Path + +When an operation has `OperationTarget::PartitionKey(pk)` *and* an `OperationPayload::Query`, +the Planner produces a trivial `SingleNode` plan and the executor sends the request directly +to the gateway against the owning physical partition. **No query plan is fetched** and **no +client-side rewriting** is performed: + +- The query body is forwarded as-is in the `application/query+json` envelope. +- Arbitrary single-partition SQL is supported, including `ORDER BY`, `GROUP BY`, `DISTINCT`, + aggregates (`COUNT`, `SUM`, …), `OFFSET / LIMIT`, and `TOP` — the gateway evaluates them + inside the single physical partition and the result page is correct as returned. +- Vector search (`VectorDistance`) and hybrid search clauses are *also* accepted on this + path today because they collapse to a single-partition execution. 
They produce correct + results when the entire vector / hybrid evaluation fits in one partition, but see the + caveat below. +- The continuation token, if any, is the server's opaque continuation for that one partition + (a `ResumeState::Request`). + +**Why this is safe.** A query whose data set is bounded to a single logical partition is +already evaluated in a single backend execution context. Aggregates and ordering operators +are correct without a client-side merge step, so the driver does not need a query plan to +drive correctness. + +**Future change — query plan fetched even for single-partition queries.** Vector and hybrid +queries can become incorrect on the single-partition fast-path in edge cases (e.g., the +backend returning per-partition truncated candidate lists where the global merge requires +the query plan's score-rewriting hints). To keep the fast-path correct as new query +features ship, the driver will eventually start **fetching a query plan** for +single-partition queries too. The plan will be cached, and for queries the plan classifies +as "passthrough" the execution path is unchanged. This is a future change and is not +required to ship the in-scope `SELECT * [WHERE …]` work. ### 5.3 Pseudo-Code: Building a Trivial Plan @@ -648,10 +740,10 @@ fn plan_trivial(operation: CosmosOperation, options: OperationOptions) -> Operat No PK range resolution is needed. The operation is wrapped in a single `Request` node. -### 5.4 Pseudo-Code: Building a ReadFeed Plan +### 5.4 Pseudo-Code: Building a Cross-Partition `SELECT *` Plan -The following pseudo-code illustrates how the Planner constructs a cross-partition ReadAll -plan, including resume from a continuation token: +The following pseudo-code illustrates how the Planner constructs a cross-partition +`SELECT *` plan, including resume from a continuation token: ```rust // PSEUDO-CODE — illustrative, not compilable @@ -733,8 +825,8 @@ The Planner architecture supports future operations without redesign: - **Change feed**: Create `Request` nodes scoped to feed ranges with change-feed-specific continuation state. Add a parent merge node based on change-feed merge semantics. - **Concurrency management**: All plan nodes receive a **concurrency permit** (semaphore - token) during execution. For ReadAll, the executor holds a single permit — sequential - by design. Future operations (ReadMany, cross-partition queries) will acquire multiple + token) during execution. For a cross-partition `SELECT *`, the executor holds a single + permit — sequential by design. Future operations (ReadMany, cross-partition queries) will acquire multiple permits from a shared semaphore, allowing the PlanExecutor to control the degree of parallelism across nodes without changing the plan model. @@ -831,6 +923,126 @@ async fn execute_node( - **Cancellation mid-page**: If the caller cancels during a page fetch, the continuation token from the *previous* completed call remains valid for resumption. +### 6.3 OperationOverrides + +A `CosmosOperation` represents the **stable, fetch-independent** part of a Cosmos request: +operation type, resource reference, partition targeting, payload, and any caller-supplied +headers. For a feed operation, the same `CosmosOperation` is reused across every page and +every EPK range — only a small set of headers and parameters differ from one fetch to the +next. 
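+
+Concretely, in the sketch below (pseudo-code; `fetch` and `overrides` are hypothetical
+shorthand for the executor internals, not proposed API), three successive fetches of one
+sequential drain share a single base operation, and only the EPK bounds and the server
+continuation vary between them:
+
+```rust
+// PSEUDO-CODE — illustrative, not compilable
+let base = Arc::new(CosmosOperation::query(container, "SELECT * FROM c", parameters));
+fetch(&base, overrides(range_00_55, None));          // ["00","55"), first page
+fetch(&base, overrides(range_00_55, Some(token_1))); // same range, next page
+fetch(&base, overrides(range_55_aa, None));          // ["55","AA"), first page
+```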
+
+To avoid cloning the full `CosmosOperation` per fetch (and to avoid holding
+`PlanNode::Request` open after a single fetch is done), the executor passes an
+`OperationOverrides` struct to `execute_single_operation` alongside an
+`&CosmosOperation` reference. Each invocation produces a fresh request by composing the
+shared base operation with the per-fetch overrides.
+
+```rust
+/// Per-fetch overrides applied on top of a shared `CosmosOperation`.
+///
+/// Strictly limited to the headers / parameters that legitimately differ
+/// between successive fetches against the same logical operation. Anything
+/// not on this list belongs on the `CosmosOperation` itself.
+#[derive(Clone, Debug, Default)]
+pub struct OperationOverrides {
+    /// EPK range the request is scoped to. When set, the transport layer
+    /// emits `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max` headers
+    /// and routes to the PK range(s) currently owning that EPK slice.
+    ///
+    /// Only valid when the base operation's target is `OperationTarget::FeedRange`
+    /// or `OperationTarget::all_ranges()`. Ignored for `OperationTarget::PartitionKey`
+    /// (a logical PK already pins the request to one physical partition).
+    pub feed_range: Option<FeedRange>,
+
+    /// Server-provided continuation token from the previous page of the
+    /// same fetch loop. Emitted as `x-ms-continuation`. `None` for the
+    /// first page.
+    pub continuation: Option<String>,
+
+    /// Per-fetch override for the maximum item count hint
+    /// (`x-ms-max-item-count`). Falls back to the value carried by
+    /// `OperationOptions::max_item_count` when unset.
+    pub max_item_count: Option<u32>,
+}
+```
+
+The `execute_single_operation` entry point therefore becomes:
+
+```rust
+pub(crate) async fn execute_single_operation(
+    &self,
+    operation: &CosmosOperation,
+    options: &OperationOptions,
+    overrides: &OperationOverrides,
+    diagnostics: &mut DiagnosticsContextBuilder,
+) -> azure_core::Result<CosmosResponse> {
+    // Apply overrides to the request being built from `operation`.
+    // Run the existing pipeline (region failover, session, retry, transport).
+}
+```
+
+#### What overrides MAY carry
+
+The set is deliberately small and frozen by this spec:
+
+| Field | Purpose | Header / wire effect |
+|-------|---------|----------------------|
+| `feed_range` | Per-fetch EPK targeting (split / merge handling, drain progression) | `x-ms-documentdb-epk-min`, `x-ms-documentdb-epk-max`, PK-range routing |
+| `continuation` | Resume the same partition mid-stream | `x-ms-continuation` |
+| `max_item_count` | Per-fetch page-size hint | `x-ms-max-item-count` |
+
+#### What overrides MUST NOT carry
+
+To keep `OperationOverrides` predictable and cheap to validate, it explicitly does NOT
+carry anything that changes operation identity, semantics, or auth. The following stay on
+the base `CosmosOperation` (or on `OperationOptions`) and are an error to put on overrides:
+
+- Operation type, resource type, resource reference.
+- Partition key value (the logical PK is part of the operation's target).
+- Request body / payload (`OperationPayload`).
+- Consistency level, session token, throughput control group, retry policy.
+- Authentication or any other header that affects request signing.
+
+#### Plan-node integration
+
+`PlanNode::Request` stores the per-fetch *intent* (the EPK range that this leaf is
+responsible for, plus any server continuation it was resumed with). 
At execution time, the +executor materializes that into an `OperationOverrides` and runs: + +```rust +// PSEUDO-CODE +async fn execute_request_node( + node: &PlanNode, // PlanNode::Request + driver_context: &DriverContext, + diagnostics: &mut DiagnosticsContextBuilder, +) -> Result { + let PlanNode::Request { operation, options, feed_range, continuation } = node else { + unreachable!() + }; + let overrides = OperationOverrides { + feed_range: Some(feed_range.clone()), + continuation: continuation.clone(), + max_item_count: options.max_item_count, + }; + driver_context + .execute_single_operation(operation.as_ref(), options, &overrides, diagnostics) + .await +} +``` + +Because `operation` is an `Arc` shared across every Request leaf in the +plan (see §5.4), and because the executor only synthesizes a tiny `OperationOverrides` +per fetch, the same `CosmosOperation` can drive an arbitrary number of EPK-range fetches +without being cloned, mutated, or re-built. The base operation outlives the entire feed +operation; overrides are scratch state owned by a single fetch and thrown away after the +response is returned. + +This is also what makes splits / merges cheap: when the Request leaf re-resolves its EPK +range to new PK range IDs (see [§9.1](#91-partition-split-during-execution)), it issues +follow-up calls to `execute_single_operation` against the **same** `CosmosOperation`, +varying only the `feed_range` (and where applicable the `continuation`) inside the +`OperationOverrides`. + --- ## 7. Continuation Tokens @@ -858,7 +1070,8 @@ Continuation tokens must be: higher version. The SDK only emits the latest version when no input token is provided. 3. **Aim for O(1) size** — Token size should ideally be constant regardless of partition - count. For ReadAll, only the state of the currently-active partition is stored, and other + count. For cross-partition `SELECT *`, only the state of the currently-active partition is + stored, and other partitions' positions are reconstructed from EPK bounds on resume. However, per-partition state MAY become necessary for certain node types (e.g., change feed requires per-range tokens). It is up to each node type to define its own resume state and thus determine @@ -981,7 +1194,7 @@ struct RequestState { |-----------|-------|----------|---------| | `ContinuationTokenInner` | `version` | `version` | Format version (integer) | | | `container_rid` | `containerRid` | Container RID (string) | -| | `operation_kind` | `operationKind` | Operation kind (e.g., `"readAll"`) | +| | `operation_kind` | `operationKind` | Operation kind (e.g., `"query"`) | | | `resume` | `resume` | `ResumeState` (tagged union) | | `SequentialDrainState` | *(tag)* | `type` | `"sequentialDrain"` | | | `epk_min` | `epkMin` | EPK min inclusive (hex string) | @@ -1054,14 +1267,14 @@ impl FromStr for ContinuationToken { #### Sample Tokens -**ReadAll, mid-stream on partition ["55","AA")** +**Cross-partition `SELECT *`, mid-stream on partition ["55","AA")** JSON (before base64 encoding): ```json { "version": 1, "containerRid": "dbs/abc/colls/def", - "operationKind": "readAll", + "operationKind": "query", "resume": { "type": "sequentialDrain", "epkMin": "55", @@ -1074,13 +1287,13 @@ JSON (before base64 encoding): On resume, the Planner sees the drain cursor at `["55","AA")`. Ranges with max ≤ `"55"` are skipped. The range `["55","AA")` resumes from `serverToken`. Ranges after `"AA"` start fresh. 
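+
+For completeness, resuming from this token looks like the following sketch (pseudo-code;
+the exact option field name is illustrative — what the driver requires is the token plus
+an equivalent operation descriptor, per §7.6):
+
+```rust
+// PSEUDO-CODE — illustrative, not compilable
+// `token_str` may have crossed a process boundary (e.g., round-tripped via a browser).
+let token: ContinuationToken = token_str.parse()?; // FromStr validates version + operationKind
+let op = CosmosOperation::query(container, query_text, parameters); // equivalent descriptor
+let mut options = OperationOptions::default();
+options.continuation = Some(token);
+let page = driver.execute_operation(&op, &options).await?; // resumes inside ["55","AA")
+```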

-**ReadAll, target partition just completed (cursor at boundary)**
+**Cross-partition `SELECT *`, target partition just completed (cursor at boundary)**
 
 ```json
 {
   "version": 1,
   "containerRid": "dbs/abc/colls/def",
-  "operationKind": "readAll",
+  "operationKind": "query",
   "resume": {
     "type": "sequentialDrain",
     "epkMin": "55",
@@ -1100,7 +1313,7 @@ A bare `RequestState` at the root (no wrapping layer):
 {
   "version": 1,
   "containerRid": "dbs/abc/colls/def",
-  "operationKind": "readAll",
+  "operationKind": "query",
   "resume": {
     "type": "request",
     "epkMin": "55",
@@ -1118,7 +1331,7 @@ A continuation token is **invalidated** by:
 2. **Token version mismatch** — A token produced by a newer SDK version may not be readable
    by an older version. Newer SDKs MUST support tokens from older versions (backward compat).
 3. **Operation kind mismatch** — The token's `operationKind` must match the operation being
-   resumed. A `readAll` token cannot be used with a query operation.
+   resumed. A `query` token cannot be used with a change feed operation.
 4. **Structure mismatch** — If the re-created plan produces a different node type than the
    token's `ResumeState` variant (e.g., a `drain` token for a single-partition operation),
    the token is rejected.
@@ -1491,7 +1704,7 @@ impl CosmosDriver {
     /// For point operations (read, create, delete, etc.), this returns the
     /// single response with no continuation token.
     ///
-    /// For feed operations (read-all), this executes one page of the plan
+    /// For feed operations (queries), this executes one page of the plan
     /// and returns the result. If more pages are available, the response
     /// includes a `ContinuationToken`. The caller passes this token back
     /// in `OperationOptions` to fetch the next page.
@@ -1508,9 +1721,10 @@ impl CosmosDriver {
 
 ### 10.2 CosmosResponse Changes
 
 `CosmosResponse` gains an optional continuation token, and its `body` field becomes a
-`ResponseBody` enum to support both single-document responses (point operations) and
-multi-document responses (feed operations) without forcing every caller to parse a feed
-envelope:
+`ResponseBody` enum to support both unparsed response bodies (point operations and
+single-page feeds the driver passes through verbatim) and pre-parsed item lists
+(feeds the driver had to aggregate or whose envelopes it had to crack open),
+without forcing every caller to parse a feed envelope:
 
 ```rust
 /// The body of a Cosmos DB response.
@@ -1523,16 +1737,21 @@ pub enum ResponseBody {
     /// No body (e.g., 204 No Content).
     None,
 
-    /// A single document body — raw serialized bytes.
-    /// Used for point operations (read, create, upsert, replace) and for
-    /// resource reads (database, container).
-    Single(Vec<u8>),
+    /// A response body the driver did not need to parse — raw serialized bytes.
+    /// Used for any operation where the driver passes the server response through
+    /// verbatim. Depending on the operation, the caller (the SDK) knows whether
+    /// these bytes represent a single item (point reads, create/upsert/replace,
+    /// resource reads like database/container) or a page of feed data (feed
+    /// operations whose envelope the driver did not need to crack open).
+    Bytes(Vec<u8>),
 
     /// A list of document bodies — one entry per item, each entry being
     /// the raw serialized bytes of one item.
-    /// Used for feed operations (ReadAll, future query/read-many).
-    /// The driver parses the response envelope to split items into a
-    /// `Vec<Vec<u8>>` but does not deserialize the items themselves. 
+    /// Used for feed operations (queries, future read-many) when the
+    /// driver had to aggregate results across partitions or otherwise parse
+    /// the feed envelope. Exists so the driver does not have to re-serialize
+    /// the parsed items just to hand them back to the SDK. The driver does
+    /// not deserialize the items themselves.
     Items(Vec<Vec<u8>>),
 }
 
@@ -1595,7 +1814,7 @@ pub struct OperationOptions {
     /// even if that group exceeds `max_item_count`. The most prominent case
     /// is the change feed, where all documents sharing the same LSN
     /// (logical sequence number) are returned in the same page to preserve
-    /// atomicity. ReadAll does not have this constraint today, but the
+    /// atomicity. `SELECT *` queries do not have this constraint today, but the
    /// contract is the same: callers MUST treat `max_item_count` as a hint,
    /// not a hard cap.
    ///
@@ -1614,12 +1833,13 @@ These fields are ignored for point operations.
 
 | Operation | Order Guarantee |
 |-----------|-----------------|
-| ReadAll (single partition) | (PartitionKey, ID) ascending. |
-| ReadAll (cross-partition) | Within each partition, (PartitionKey, ID) ascending. Across partitions, items are yielded in EPK order (implementation behavior, not a service guarantee). |
+| `SELECT *` (single partition) | (PartitionKey, ID) ascending. |
+| `SELECT *` (cross-partition) | Within each partition, (PartitionKey, ID) ascending. Across partitions, items are yielded in EPK order (implementation behavior, not a service guarantee). |
 
 ### 10.5 Page Boundaries
 
-Each `execute_operation` call for ReadAll returns exactly one page from exactly one partition:
+Each `execute_operation` call for a cross-partition `SELECT *` returns exactly one page from
+exactly one partition:
 
 - **Server-side max item count**: The server may return fewer items than requested.
 - **Client-side max item count**: Configurable via `OperationOptions::max_item_count`.
@@ -1633,6 +1853,167 @@ Each `execute_operation` call for a cross-partition `SELECT *` returns exactly o
 
 Pages never span partition boundaries.
 
+### 10.6 SDK Option Plumbing
+
+The driver-level `OperationOptions` type is **not** what the SDK exposes to user code
+verbatim. Each public SDK method has its own options struct whose fields are a curated
+subset (and occasionally a superset) of the driver's `OperationOptions` and the
+operation-specific knobs.
+
+`OperationTarget` is the one driver type that the SDK **re-exports verbatim** (the same
+way the SDK already re-exports `PartitionKey`, `FeedRange`, consistency-level enums, etc.
+from the driver). Because the variants are mutually exclusive at the type level, the SDK
+takes a single `target` argument on its feed-operation methods rather than a pair of
+optional `partition_key` / `feed_range` fields. This pushes the "exactly one of"
+invariant down to the type system and removes a runtime validation step.
+
+#### Constructors for `OperationTarget`
+
+`OperationTarget` exposes named factories rather than `From` impls. Callers explicitly
+say which targeting mode they want, but they do not have to wrap their argument in the
+enum variant by hand:
+
+```rust
+impl OperationTarget {
+    /// Targets a single logical partition key.
+    pub fn partition(pk: impl Into<PartitionKey>) -> Self {
+        OperationTarget::PartitionKey(pk.into())
+    }
+
+    /// Targets a specific feed range (one physical partition, several adjacent ones,
+    /// etc.). Use `OperationTarget::all_ranges()` for the whole container. 
+    pub fn feed_range(fr: impl Into<FeedRange>) -> Self {
+        OperationTarget::FeedRange(fr.into())
+    }
+
+    /// The full key space: targets all partition key ranges.
+    pub fn all_ranges() -> Self {
+        OperationTarget::FeedRange(FeedRange::all_ranges())
+    }
+}
+```
+
+There are deliberately no `From<PartitionKey>` / `From<FeedRange>` impls — picking
+between a single-partition and a feed-range query is a real decision and should be
+visible at the call site, not silently inferred from a parameter type.
+
+#### Method signature change for `query_items`
+
+Today `query_items` takes a `partition_key: impl Into<PartitionKey>` (or
+similar) parameter. As part of this work it changes to take a single
+`target: OperationTarget` parameter. The old `partition_key` parameter is removed
+entirely — there is no compatibility shim, since this lands together with the broader
+feed-operations refactor:
+
+```rust
+impl ContainerClient {
+    pub async fn query_items<T: DeserializeOwned>(
+        &self,
+        query: impl Into<String>,
+        parameters: impl IntoIterator,
+        target: OperationTarget,
+        options: Option<QueryOptions>,
+    ) -> azure_core::Result<FeedPager<T>>;
+}
+```
+
+Call-site shapes:
+
+```rust
+// Single logical partition (was: .partition_key(pk))
+container.query_items(sql, [], OperationTarget::partition(pk), None).await?;
+
+// Specific FeedRange (e.g., from a previous split-aware iteration)
+container.query_items(sql, [], OperationTarget::feed_range(fr), None).await?;
+
+// Whole container — the most common case
+container.query_items(sql, [], OperationTarget::all_ranges(), None).await?;
+```
+
+`target` is a required positional argument rather than an option on `QueryOptions`,
+because every query has to declare its scope and we want that decision visible at the
+call site. Callers that want "the whole container" pass `OperationTarget::all_ranges()`
+explicitly. `QueryOptions` therefore does **not** carry a `target` field:
+
+```rust
+// Re-exported from the driver.
+pub use azure_data_cosmos_driver::OperationTarget;
+
+#[derive(Clone, Debug, Default)]
+#[non_exhaustive]
+pub struct QueryOptions {
+    /// Page-size hint forwarded as `x-ms-max-item-count`.
+    pub max_item_count: Option<u32>,
+
+    /// Continuation token from a prior `FeedPage::continuation_token()`.
+    pub continuation: Option<String>,
+
+    /// Standard cross-cutting knobs (consistency, session, retry, timeout, etc.).
+    pub consistency_level: Option<ConsistencyLevel>,
+    pub session_token: Option<String>,
+    // ...
+}
+```
+
+The flow, for queries specifically, is:
+
+```text
+user code
+  │  query, parameters, target: OperationTarget,
+  │  QueryOptions { max_item_count?, continuation?,
+  │                 consistency_level?, session_token?, ... }
+  ▼
+ContainerClient::query_items(query, parameters, target, options)
+  │  builds:
+  │    op = CosmosOperation::query(container, query, parameters)
+  │           .with_target(target)
+  │    opts = OperationOptions {
+  │      max_item_count: options.max_item_count,
+  │      continuation: options.continuation,
+  │      consistency_level, session_token, retry, timeout, ...
+  │    }
+  ▼
+CosmosDriver::execute_operation(op, opts)
+  │  Planner → OperationPlan
+  │  PlanExecutor walks the plan, per fetch builds an OperationOverrides
+  │  and calls execute_single_operation(&op, &opts, &overrides, ...)
+  ▼
+CosmosResponse { body, headers, status, diagnostics, continuation_token }
+  │  SDK converts ResponseBody::Items into FeedPage<T> via T: DeserializeOwned
+  ▼
+user code
+```
+
+Two consequences of this layering:
+
+1. 
**Mutual exclusivity is a property of the type, not a runtime check.** A + `target: OperationTarget` parameter cannot express "both a partition key and a feed + range" — the user picks one constructor (`OperationTarget::partition(pk)`, + `OperationTarget::feed_range(fr)`, or `OperationTarget::all_ranges()`). The SDK does + not need a runtime guard, and the driver-side `OperationTarget` enum carries the + same guarantee all the way down. + +2. **Options that the driver ignores for a given operation are still allowed at the SDK + layer.** For example, `max_item_count` on a point-read SDK call is silently dropped by + the driver (point ops produce one response). This keeps the SDK option structs ergonomic + and consistent, with the driver as the single point that decides which knobs are + meaningful for which operation. + +#### Per-operation SDK option structs (sketch) + +| SDK method | Options struct | Notable fields it injects into `CosmosOperation` / `OperationOptions` | +|------------|----------------|----------------------------------------------------------------------| +| `read_item` | `ReadItemOptions` | consistency, session, retry → `OperationOptions` | +| `create_item` / `upsert_item` / `replace_item` | `ItemOptions` | indexing directive, pre/post triggers → request headers; consistency → options | +| `delete_item` | `DeleteItemOptions` | consistency → options | +| `query_items` | `QueryOptions` (+ required `target: OperationTarget` positional arg) | `target` → `OperationTarget`; `max_item_count`, `continuation`, consistency → `OperationOptions` | + +The SDK does not expose `OperationOverrides` to user code at all — it is purely an +internal type used by the driver to thread per-fetch state through +`execute_single_operation`. Users control per-fetch behavior indirectly: they set +`max_item_count` once on the SDK options struct, and the driver applies it to every +fetch unless a specific plan node has a reason to override (none today). + --- ## 11. Testing Strategy @@ -1642,8 +2023,8 @@ Pages never span partition boundaries. | Test Area | Cases | |-----------|-------| | Planner — point ops | Verify SingleNode plan for each point operation type. | -| Planner — ReadAll | Verify Graph plan with SequentialDrain root, correct Request children per PK range. | -| Planner — ReadAll resume | Verify resume skips drained partitions, resumes active, starts right fresh. | +| Planner — cross-partition `SELECT *` | Verify Graph plan with SequentialDrain root, correct Request children per PK range. | +| Planner — cross-partition `SELECT *` resume | Verify resume skips drained partitions, resumes active, starts right fresh. | | Planner — bottom-up invariant | Verify children always have lower NodeIds than parents. | | PlanExecutor — single node | Execute SingleNode plan, verify result matches direct pipeline call. | | PlanExecutor — drain | Execute SequentialDrain plan with mock pipeline, verify sequential execution. | @@ -1659,6 +2040,11 @@ Pages never span partition boundaries. | ContinuationToken — unknown variant | Unknown `ResumeState` type fails gracefully on deserialize. | | NodeId/NodeRange | Verify range iteration, length, empty checks. | | OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `FeedRange` produce correct targets. | +| OperationTarget — mutual exclusivity | SDK rejects requests that supply both `partition_key` and `feed_range`. 
+
+The flow, for queries specifically, is:
+
+```text
+user code
+   │  query, parameters, target: OperationTarget,
+   │  QueryOptions { max_item_count?, continuation?,
+   │                 consistency_level?, session_token?, ... }
+   ▼
+ContainerClient::query_items(query, parameters, target, options)
+   │  builds:
+   │    op   = CosmosOperation::query(container, query, parameters)
+   │              .with_target(target)
+   │    opts = OperationOptions {
+   │        max_item_count: options.max_item_count,
+   │        continuation: options.continuation,
+   │        consistency_level, session_token, retry, timeout, ...
+   │    }
+   ▼
+CosmosDriver::execute_operation(op, opts)
+   │  Planner → OperationPlan
+   │  PlanExecutor walks the plan, per fetch builds an OperationOverrides
+   │  and calls execute_single_operation(&op, &opts, &overrides, ...)
+   ▼
+CosmosResponse { body, headers, status, diagnostics, continuation_token }
+   │  SDK converts ResponseBody::Items into FeedPage<T> via T: DeserializeOwned
+   ▼
+user code
+```
+
+Two consequences of this layering:
+
+1. **Mutual exclusivity is a property of the type, not a runtime check.** A
+   `target: OperationTarget` parameter cannot express "both a partition key and a feed
+   range" — the user picks one constructor (`OperationTarget::partition(pk)`,
+   `OperationTarget::feed_range(fr)`, or `OperationTarget::all_ranges()`). The SDK does
+   not need a runtime guard, and the driver-side `OperationTarget` enum carries the
+   same guarantee all the way down.
+
+2. **Options that the driver ignores for a given operation are still allowed at the SDK
+   layer.** For example, `max_item_count` on a point-read SDK call is silently dropped by
+   the driver (point ops produce one response). This keeps the SDK option structs ergonomic
+   and consistent, with the driver as the single point that decides which knobs are
+   meaningful for which operation.
+
+#### Per-operation SDK option structs (sketch)
+
+| SDK method | Options struct | Notable fields it injects into `CosmosOperation` / `OperationOptions` |
+|------------|----------------|----------------------------------------------------------------------|
+| `read_item` | `ReadItemOptions` | consistency, session, retry → `OperationOptions` |
+| `create_item` / `upsert_item` / `replace_item` | `ItemOptions` | indexing directive, pre/post triggers → request headers; consistency → options |
+| `delete_item` | `DeleteItemOptions` | consistency → options |
+| `query_items` | `QueryOptions` (+ required `target: OperationTarget` positional arg) | `target` → `OperationTarget`; `max_item_count`, `continuation`, consistency → `OperationOptions` |
+
+The SDK does not expose `OperationOverrides` to user code at all — it is purely an
+internal type used by the driver to thread per-fetch state through
+`execute_single_operation`. Users control per-fetch behavior indirectly: they set
+`max_item_count` once on the SDK options struct, and the driver applies it to every
+fetch unless a specific plan node has a reason to override (none today).
+
 ---
 
 ## 11. Testing Strategy
@@ -1642,8 +2023,8 @@ Pages never span partition boundaries.
 
 | Test Area | Cases |
 |-----------|-------|
 | Planner — point ops | Verify SingleNode plan for each point operation type. |
-| Planner — ReadAll | Verify Graph plan with SequentialDrain root, correct Request children per PK range. |
-| Planner — ReadAll resume | Verify resume skips drained partitions, resumes active, starts right fresh. |
+| Planner — cross-partition `SELECT *` | Verify Graph plan with SequentialDrain root, correct Request children per PK range. |
+| Planner — cross-partition `SELECT *` resume | Verify resume skips drained partitions, resumes active, starts right fresh. |
 | Planner — bottom-up invariant | Verify children always have lower NodeIds than parents. |
 | PlanExecutor — single node | Execute SingleNode plan, verify result matches direct pipeline call. |
 | PlanExecutor — drain | Execute SequentialDrain plan with mock pipeline, verify sequential execution. |
@@ -1659,6 +2040,11 @@ Pages never span partition boundaries.
 | ContinuationToken — unknown variant | Unknown `ResumeState` type fails gracefully on deserialize. |
 | NodeId/NodeRange | Verify range iteration, length, empty checks. |
 | OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `FeedRange` produce correct targets. |
+| OperationTarget — mutual exclusivity | Type-level: a single `OperationTarget` value cannot carry both a partition key and a feed range, so no runtime guard is needed (see consequence 1 above). |
+| OperationOverrides — feed_range / continuation / max_item_count | Verify overrides translate to `x-ms-documentdb-epk-min/max`, `x-ms-continuation`, `x-ms-max-item-count` headers. |
+| OperationOverrides — base op reuse | One `Arc<CosmosOperation>` drives multiple fetches with distinct overrides; base op is never cloned. |
+| Single-partition query fast-path | `OperationTarget::PartitionKey` + `OperationPayload::Query` produces a SingleNode plan, no PK range cache lookup, no query-plan fetch. |
+| Single-partition query — ORDER BY / aggregates | Verify `ORDER BY`, `GROUP BY`, `COUNT(*)` queries are forwarded verbatim and pass through. |
 | Diagnostics — hierarchy | Verify recursive node tree structure appears in diagnostics JSON. |
 | Diagnostics — children | Verify composite nodes contain child node diagnostics. |
 | Diagnostics — backward compat | Verify `requests()` flattening returns all requests from nested nodes. |
 
@@ -1667,15 +2053,16 @@ Pages never span partition boundaries.
 
 | Test Area | Cases |
 |-----------|-------|
-| ReadAll — basic | Read all items from a container, verify all returned in EPK order. |
-| ReadAll — empty container | ReadAll on empty container returns no results, no continuation. |
-| ReadAll — single partition | All items in one partition, verify SingleNode plan execution. |
-| ReadAll — multi partition | Items across multiple partitions, verify sequential drain. |
-| ReadAll — pagination | Verify continuation token threads correctly across pages. |
-| ReadAll — resume | Get continuation mid-stream, resume from it, verify continued results. |
-| ReadAll — resume across SDK versions | Serialize token, deserialize with newer SDK, verify resume works. |
-| ReadAll — partition split | Trigger split during ReadAll, verify Request node re-resolves and completes. |
-| ReadAll — large dataset | Read many items, verify all pages and partitions are drained. |
+| `SELECT *` — basic | Read all items from a container, verify all returned in EPK order. |
+| `SELECT *` — empty container | `SELECT *` on empty container returns no results, no continuation. |
+| `SELECT *` — single partition | All items in one partition, verify SingleNode plan execution. |
+| `SELECT *` — multi partition | Items across multiple partitions, verify sequential drain. |
+| `SELECT *` — pagination | Verify continuation token threads correctly across pages. |
+| `SELECT *` — resume | Get continuation mid-stream, resume from it, verify continued results. |
+| `SELECT *` — resume across SDK versions | Serialize token, deserialize with newer SDK, verify resume works. |
+| `SELECT *` — partition split | Trigger split during cross-partition `SELECT *`, verify Request node re-resolves and completes. |
+| `SELECT *` — large dataset | Read many items, verify all pages and partitions are drained. |
+| `SELECT * WHERE` — server-side filter | Verify `WHERE` predicate is applied server-side, only matching items returned. |
 | Diagnostics — RU aggregation | Verify total RU charge sums across all pages. |
 | Diagnostics — plan structure | Verify diagnostics JSON shows SequentialDrain/Request hierarchy with children. |
 
@@ -1684,7 +2071,7 @@ Pages never span partition boundaries.
 
 | Test Area | Metric |
 |-----------|--------|
 | Point op overhead | Latency regression < 1% vs. direct `execute_single_operation`. |
-| ReadAll latency | Sequential partition drain does not introduce unnecessary overhead. |
+| Cross-partition `SELECT *` latency | Sequential partition drain does not introduce unnecessary overhead. 
| --- @@ -1699,17 +2086,24 @@ node. This adds concurrency control (semaphore-based) to the PlanExecutor and a ### 12.2 Cross-Partition Queries -Cross-partition queries require fetching a backend query plan, creating `Request` nodes per -partition, and optionally performing client-side sort for ORDER BY queries via an -`OrderedMerge` node. This adds query plan fetching callbacks to the Planner and k-way -merge logic to the PlanExecutor. +Cross-partition queries with `ORDER BY`, `GROUP BY`, aggregates (`COUNT`, `SUM`, …), +`OFFSET / LIMIT`, vector search, or hybrid search require fetching a backend query plan, +creating `Request` nodes per partition, and optionally performing client-side sort / merge +via an `OrderedMerge` node. This adds query-plan fetching callbacks to the Planner and +k-way merge logic to the PlanExecutor. + +The same query-plan path is also planned to back single-partition vector / hybrid queries +(see [§5.2.1](#521-single-partition-query-fast-path)) so the driver can apply score +normalization and other plan-driven hints uniformly. For non-vector single-partition +queries the fast-path remains. ### 12.3 Change Feed The change feed is a specialized feed operation with unique characteristics: start-from modes, lease-based partition assignment, and incremental/full-fidelity modes. -Unlike ReadAll's sequential drain (where only the active partition's state is needed), +Unlike a cross-partition `SELECT *`'s sequential drain (where only the active partition's +state is needed), change feed requires **per-range continuation tokens**. Each feed range maintains its own server continuation, and the resume state is a list of per-range tokens: @@ -1756,3 +2150,14 @@ is a performance optimization, not a correctness concern. The existing hedging mechanism (speculative execution in secondary regions) could be extended to individual plan nodes, allowing feed fetches to hedge independently. + +### 12.7 Dedicated `ReadAllItems` Convenience Operation + +Today the unfiltered "read every document in the container" case is expressed as a +`SELECT * FROM c` query. A future revision may add a dedicated `read_all_items` SDK method +and a corresponding `CosmosOperation::read_all_items(...)` factory backed by the existing +`OperationType::ReadFeed` (point-read-feed) wire path. That path avoids the +`application/query+json` envelope and reads at the gateway as a feed read rather than a +query, which can be cheaper RU-wise on large containers. The plan model and continuation +token format above already accommodate this — only the payload variant and the chosen +wire shape differ — so this is purely additive. From 3d32bbbe70281cbecd8d944504262ec7db592af2 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Tue, 5 May 2026 18:29:16 +0000 Subject: [PATCH 11/29] Plan implementation of feed operations spec --- .../docs/FEED_OPERATIONS_SPEC.md | 658 +++++++++++++++++- 1 file changed, 657 insertions(+), 1 deletion(-) diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md index 753b63b80c9..044e3b035b9 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md +++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md @@ -21,6 +21,7 @@ 10. [API Semantics & Invariants](#10-api-semantics--invariants) 11. [Testing Strategy](#11-testing-strategy) 12. [Future Work](#12-future-work) +13. 
[Implementation Plan](#13-implementation-plan)
 
 ---
 
@@ -2036,7 +2037,7 @@ fetch unless a specific plan node has a reason to override (none today).
 | ContinuationToken — operation kind | Token with wrong operation kind is rejected. |
 | ContinuationToken — split recovery | Token with EPK bounds spanning a split range maps to correct child ranges. |
 | ContinuationToken — SequentialDrain resume | SequentialDrain node correctly classifies partitions as left/target/right. |
-| ContinuationToken — nesting | Nested tokens round-trip correctly through serialize/deserialize. |
+| ContinuationToken — nesting | Nested tokens serialize to the expected exact string and parse back from a fixed input (no round-trip tests; see §13.0). |
 | ContinuationToken — unknown variant | Unknown `ResumeState` type fails gracefully on deserialize. |
 | NodeId/NodeRange | Verify range iteration, length, empty checks. |
 | OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `FeedRange` produce correct targets. |
@@ -2161,3 +2162,658 @@ and a corresponding `CosmosOperation::read_all_items(...)` factory backed by the
 query, which can be cheaper RU-wise on large containers. The plan model and continuation
 token format above already accommodate this — only the payload variant and the chosen
 wire shape differ — so this is purely additive.
+
+---
+
+## 13. Implementation Plan
+
+This section is the execution checklist for landing the spec. It is split into **two
+PR-sized phases**. Each phase ends with a working, mergeable, end-to-end slice — Phase 1
+unblocks single-partition queries; Phase 2 unblocks cross-partition queries.
+
+The plan is deliberately mechanical so a follow-up coding agent can execute it without
+re-deriving design decisions. Cross-references to the design sections above are inline.
+
+### 13.0 Conventions
+
+- All new public types live in `azure_data_cosmos_driver` and are re-exported from
+  `azure_data_cosmos` only when the SDK layer needs them in its public surface.
+- All driver-internal types are `pub(crate)`.
+- New code must derive `SafeDebug` (not `Debug`) for any type that may carry user data.
+- Every public type needs a doc comment summary + details.
+- Each phase ends with: `cargo fmt -p azure_data_cosmos_driver -p azure_data_cosmos`,
+  `cargo clippy -p azure_data_cosmos_driver -p azure_data_cosmos --all-features`,
+  `cargo test -p azure_data_cosmos_driver -p azure_data_cosmos --all-features`. All three
+  must be clean before opening the PR.
+- **Integration tests run against the live Cosmos DB Emulator**, not via test-proxy
+  recordings. Agents should expect the user to assist with starting / pointing at the
+  emulator and reviewing the resulting test runs.
+- **Serialization tests never use round-trip assertions.** For each `Display` /
+  `Serialize` impl, assert against an exact expected string. For each `FromStr` /
+  `Deserialize` impl, feed an exact input string and assert the parsed structure. Tests
+  MAY locally base64-decode the wire format inside the test body so that the on-disk
+  exemplar can stay as plain JSON.
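+
+A sketch of that last convention (the `base64` crate's `URL_SAFE_NO_PAD` engine matches
+the unpadded base64url wire form; the token fixture, JSON field names, and accessor are
+placeholders, not the real schema):
+
+```rust
+use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _};
+
+#[test]
+fn token_serializes_to_exact_string() {
+    let token = sample_request_token(); // hypothetical fixture builder
+    // The exemplar stays human-readable JSON; encode it locally to derive the
+    // exact expected wire string — no round-trip through FromStr.
+    let exemplar = r#"{"version":1,"operationKind":"query"}"#;
+    assert_eq!(token.to_string(), URL_SAFE_NO_PAD.encode(exemplar));
+}
+
+#[test]
+fn token_parses_from_exact_string() {
+    let input = URL_SAFE_NO_PAD.encode(r#"{"version":1,"operationKind":"query"}"#);
+    let token: ContinuationToken = input.parse().expect("valid token");
+    assert_eq!(token.version(), 1); // hypothetical accessor
+}
+```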
+
+---
+
+### 13.1 Phase 1 — Single-Node Plans
+
+**End-state for Phase 1.** A user can write:
+
+```rust
+let pager = container.query_items::<serde_json::Value>(
+    "SELECT * FROM c WHERE c.region = 'westus'",
+    [],
+    OperationTarget::partition("westus"),
+    None,
+)?;
+while let Some(page) = pager.next().await {
+    let page = page?;
+    // ... use page.items, page.continuation_token() ...
+}
+```
+
+…and every existing point operation (`read_item`, `create_item`, `delete_item`, etc.)
+goes through the new Plan → Execute pipeline. Cross-partition / `FeedRange` targets are
+type-acceptable but error at planning time pointing to "Phase 2".
+
+#### 13.1.1 Foundational types in `azure_data_cosmos_driver`
+
+These can be done in any order but must all land before the planner work.
+
+1. **Migrate `FeedRange` into the driver and extend it.** §3.2.1.
+   - Move `sdk/cosmos/azure_data_cosmos/src/feed_range.rs` to
+     `sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs`. Add to
+     `models/mod.rs`. Make the type `pub`.
+   - Add `pub use azure_data_cosmos_driver::models::FeedRange;` to
+     `azure_data_cosmos/src/lib.rs` (or wherever the existing re-export lives) so
+     the public API does not change.
+   - Update the driver's internal callers that previously used the old path.
+   - **Add `pub fn FeedRange::for_partition_key(pk: impl Into<PartitionKey>, definition: &PartitionKeyDefinition) -> azure_core::Result<FeedRange>`.**
+     Computes the EPK for the given partition-key value and returns a single-EPK
+     `FeedRange` whose `min_inclusive == max_exclusive == EPK(pk)`. Returns an error if
+     the value count does not match the definition (full keys only — Phase 1 does not
+     attempt to support MultiHash prefix keys here; defer that to a follow-up if
+     needed).
+   - **Add `pub fn FeedRange::is_singleton(&self) -> bool`** returning `true` iff the
+     range bounds collapse to a single EPK (i.e. inclusive lower bound equals the
+     bound that would otherwise be exclusive — the implementation should treat the
+     "singleton" representation as a closed-closed range over one EPK or use whatever
+     internal marker `for_partition_key` produces, as long as the predicate is exact
+     for those constructed values and `false` for any range covering more than one
+     EPK).
+   - **Remove `ContainerClient::feed_range_from_partition_key`** from the SDK — its
+     functionality is now `FeedRange::for_partition_key` (callers can fetch the
+     `PartitionKeyDefinition` from the container themselves, or via a thin sync
+     helper on `ContainerClient` that returns the definition). Update the changelog.
+   - **Acceptance:** `cargo build -p azure_data_cosmos -p azure_data_cosmos_driver`.
+
+2. **Add `OperationTarget`.** §3.2.
+   - New file `models/operation_target.rs`. Public enum with three variants
+     (`None`, `PartitionKey { key: PartitionKey, feed_range: FeedRange }`,
+     `FeedRange(FeedRange)`).
+   - The `PartitionKey` variant carries both the logical key (used for the
+     `x-ms-documentdb-partitionkey` header on gateway-issued requests) and its
+     singleton `FeedRange` (the range whose min/max EPK equals `EPK(key)`).
+     **The EPK headers are NOT used for `OperationTarget::PartitionKey` requests** —
+     the gateway routes by the logical-PK header. The singleton `FeedRange` exists so
+     downstream planning, continuation tokens, and merge-recovery can reason about a
+     PK target uniformly with `FeedRange` targets.
+   - Implement the named constructors:
+     - `pub fn partition(key: impl Into<PartitionKey>, definition: &PartitionKeyDefinition) -> azure_core::Result<Self>` —
+       computes the singleton `FeedRange` via `FeedRange::for_partition_key` and
+       stores both. Errors propagate from `for_partition_key`.
+     - `pub fn feed_range(impl Into<FeedRange>) -> Self`.
+     - `pub fn all_ranges() -> Self` — returns `Self::FeedRange(FeedRange::full())`.
+   - Add accessors: `pub fn partition_key(&self) -> Option<&PartitionKey>`,
+     `pub fn feed_range(&self) -> Option<&FeedRange>` (returns `Some` for both
+     `PartitionKey` and `FeedRange` variants).
+   - Do **not** implement `From<PartitionKey>` or `From<FeedRange>`. §10.6.
+   - Re-export from `models/mod.rs`.
+   - **Note on the SDK's `ContainerClient::query_items` ergonomics:** since
+     `OperationTarget::partition` requires a `PartitionKeyDefinition`, the SDK
+     provides a thin wrapper `ContainerClient::partition_target(key) -> OperationTarget`
+     that pulls the definition from `container_connection`. Driver-level callers that
+     already hold a `PartitionKeyDefinition` (e.g. existing point-op factories) call
+     `OperationTarget::partition` directly.
+
+3. **Add `OperationPayload`.** §3.1.
+   - New enum with `None`, `Body(Vec<u8>)`, `Query { query, parameters }`.
+   - Define the Phase-1 `QueryParameter` shape: `{ name: String, value: serde_json::Value }`
+     unless an equivalent already exists; if one does, reuse it.
+   - Add a `pub fn body(&self) -> Option<&[u8]>` for transport-layer convenience.
+
+4. **Add `OperationOverrides`.** §6.3.
+   - Fields exactly: `feed_range: Option<FeedRange>`, `continuation: Option<String>`,
+     `max_item_count: Option<u32>`. `Default`-derive friendly.
+   - Document the allow/deny list inline.
+
+5. **Add `ContinuationToken` (Request variant only).** §7.
+   - `pub struct ContinuationToken { inner: ContinuationTokenInner }`.
+   - Internal `ContinuationTokenInner { version, container_rid, operation_kind, resume }`.
+   - `enum ResumeState { Request(RequestState) }` only — `SequentialDrain` is added in
+     Phase 2.
+   - `RequestState` carries `server_token: String` plus a `target` discriminator that
+     captures the original `OperationTarget`:
+     - For `OperationTarget::PartitionKey { key, .. }`: store the **logical partition
+       key value** (serialized via `PartitionKey`'s existing wire form). On resume,
+       the planner reconstructs the singleton `FeedRange` by re-running
+       `FeedRange::for_partition_key` with the container's current
+       `PartitionKeyDefinition`.
+     - For `OperationTarget::FeedRange(_)`: store `epk_min` / `epk_max`.
+     - For `OperationTarget::None`: store nothing extra (control-plane resumes are
+       rare but the variant exists for symmetry).
+   - Implement `Display` (base64url-encoded JSON, no padding) and `FromStr` (decode +
+     version check). Tests assert the exact JSON exemplar — see §13.1.7.
+   - Const `CURRENT_TOKEN_VERSION: u32 = 1`.
+   - **Version preservation rule** (§7.1): emit output tokens at the same version as the
+     input token; emit `CURRENT_TOKEN_VERSION` only when there is no input token.
+     This only applies if the incoming version is KNOWN to the Driver, so nothing is
+     actually needed here since there is only one version and the Driver already
+     emits it.
+
+6. **Add `ResponseBody` and update `CosmosResponse`.** §10.2.
+   - Variants `None`, `Bytes(Vec<u8>)`, `Items(Vec<Vec<u8>>)`.
+   - Replace `CosmosResponse::body() -> &[u8]` with `body() -> &ResponseBody` (and
+     remove `into_body`). Update every caller. Convenience `as_bytes(&self) -> Option<&[u8]>`
+     is acceptable to ease migration.
+   - Add `continuation_token: Option<ContinuationToken>` field + `continuation_token()`
+     accessor.
+
+#### 13.1.2 `CosmosOperation` refactor
+
+§3.
+
+7. **Field swap.** Replace `body: Option<Vec<u8>>` with `payload: OperationPayload`;
+   replace `partition_key: Option<PartitionKey>` with `target: OperationTarget`.
+   Keep `with_body(Vec<u8>)` as sugar for `with_payload(OperationPayload::Body(...))`.
+   Add `with_payload(OperationPayload)` and `with_target(OperationTarget)`.
+8. **Update all existing factory methods** in `models/cosmos_operation.rs` to populate
+   `target` and `payload` correctly (every `read_item`, `create_item`, `delete_item`,
+   `batch`, `query_items`, `read_all_databases`, etc.). Point ops use
+   `OperationTarget::partition(pk)`; account/database ops use `OperationTarget::None`;
+   existing `query_items` factory keeps `OperationTarget::partition(pk)` for now.
+
+9. **Add `CosmosOperation::query(container, query, parameters)`.** §3.3. Defaults to
+   `OperationTarget::all_ranges()` so callers that target a single PK must call
+   `.with_target(OperationTarget::partition(pk))`.
+
+#### 13.1.3 Pipeline rename + overrides plumbing
+
+10. **Rename `execute_operation_pipeline` → `execute_single_operation`.** §2 / §6.3.
+    - New signature:
+      `async fn execute_single_operation(&self, operation: &CosmosOperation, options: &OperationOptions, overrides: &OperationOverrides, diagnostics: &mut DiagnosticsContextBuilder) -> Result<CosmosResponse>`.
+    - Call sites within the driver pass a default `OperationOverrides` for now (point
+      ops don't use it).
+
+11. **Apply overrides in the request builder.**
+    - `overrides.continuation` → `x-ms-continuation` header.
+    - `overrides.max_item_count` → `x-ms-max-item-count` header (falls back to
+      `options.max_item_count` if unset).
+    - `overrides.feed_range` → `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max`
+      headers, **only when the operation's `target` is NOT `OperationTarget::PartitionKey`**.
+      Gateway-routed PK queries use the logical-PK header instead and rejecting both
+      together is the simplest correctness rule. The planner is responsible for never
+      setting `overrides.feed_range` on a PK-targeted node, but the request builder
+      enforces the invariant defensively (debug-assert in the builder).
+      Phase 1 never sets EPK headers from the planner, but the wiring must be in
+      place for Phase 2 — add a unit test that sets `overrides.feed_range` on a
+      `OperationTarget::None` operation to lock the behavior.
+
+12. **Translate `OperationPayload` to wire body.**
+    - `None` → no body.
+    - `Body(b)` → bytes verbatim, `Content-Type: application/json`.
+    - `Query { query, parameters }` → JSON envelope `{"query":..., "parameters":[...]}`,
+      `Content-Type: application/query+json`. Set `x-ms-documentdb-isquery: True` and
+      `x-ms-documentdb-query-iscontinuationexpected: True` for the cross-partition path
+      later, but for Phase 1 single-partition queries set `IsContinuationExpected: True`
+      and **omit** `x-ms-documentdb-query-enablecrosspartition`.
+
+13. **Extract response continuation.** When `x-ms-continuation` is present on the
+    response, set `CosmosResponse.continuation_token` to a `ContinuationToken` whose
+    `ResumeState::Request` carries:
+    - the `server_token` from the header,
+    - the `target` discriminator captured from the originating `CosmosOperation`:
+      - `OperationTarget::PartitionKey { key, .. }` → store the logical PK value
+        (NOT the EPK bounds — those are reconstructed from the definition on resume),
+      - `OperationTarget::FeedRange(fr)` → store `fr`'s `min_inclusive` / `max_exclusive`,
+      - `OperationTarget::None` → no extra fields.
+    - The `container_rid` and `operation_kind` come from the `CosmosOperation` /
+      `OperationType`. Use `"query"` for `OperationType::Query`, the operation type
+      name for everything else.
+
+14. **`ResponseBody` variant selection.**
+    - For `OperationPayload::Query`, parse the response envelope and emit
+      `ResponseBody::Items(Vec<Vec<u8>>)`. The driver does **not** deserialize items —
+      it slices the `Documents` array into a `Vec<Vec<u8>>` of raw JSON values. (Use
+      `serde_json::value::RawValue` or equivalent to avoid double-parse.)
+    - For all other operations, emit `ResponseBody::Bytes(...)` for non-empty bodies and
+      `ResponseBody::None` for 204 / empty bodies.
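+
+A sketch of step 11's header plumbing (the builder type and `FeedRange` accessors are
+hypothetical; the header names are the ones listed above):
+
+```rust
+fn apply_overrides(
+    request: &mut RequestBuilder, // hypothetical transport builder
+    target: &OperationTarget,
+    options: &OperationOptions,
+    overrides: &OperationOverrides,
+) {
+    if let Some(continuation) = &overrides.continuation {
+        request.header("x-ms-continuation", continuation);
+    }
+    if let Some(count) = overrides.max_item_count.or(options.max_item_count) {
+        request.header("x-ms-max-item-count", &count.to_string());
+    }
+    if let Some(range) = &overrides.feed_range {
+        // Defensive check from step 11: EPK headers never accompany a PK target.
+        debug_assert!(!matches!(target, OperationTarget::PartitionKey { .. }));
+        request.header("x-ms-documentdb-epk-min", range.min_inclusive());
+        request.header("x-ms-documentdb-epk-max", range.max_exclusive());
+    }
+}
+```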
+
+#### 13.1.4 Plan model + planner + executor (single-node only)
+
+§4–§6.
+
+15. **Plan types — minimal shape.**
+    - `pub(crate) enum OperationPlan { SingleNode(PlanNode) }` — no `Graph` variant yet
+      (defer to Phase 2 with a `// TODO(phase-2): Graph variant`).
+    - `pub(crate) enum PlanNode { Request { operation: Arc<CosmosOperation>, options: OperationOptions, feed_range: Option<FeedRange>, continuation: Option<String> } }` —
+      no `SequentialDrain` yet.
+    - `NodeId` / `NodeRange` are NOT needed in Phase 1; add them in Phase 2.
+
+16. **`Planner::plan(operation, options, continuation)`.** §5.
+    - Always returns `Ok(OperationPlan::SingleNode(...))` in Phase 1.
+    - Acceptance rules:
+      - Any `OperationType` other than `Query` → SingleNode regardless of target. (Point
+        ops, batch, control-plane ops.)
+      - `OperationType::Query` with `OperationTarget::PartitionKey(_)` → SingleNode
+        fast-path (no query plan fetch). §5.2.1.
+      - `OperationType::Query` with `OperationTarget::FeedRange(_)` /
+        `OperationTarget::all_ranges()` → return
+        `Err(azure_core::Error::with_message(ErrorKind::Other, "cross-partition queries are not yet supported (planned for Phase 2)"))`.
+        Phase 2 lifts this.
+    - When a continuation token is present, validate:
+      - `version <= CURRENT_TOKEN_VERSION` (already checked in `FromStr`).
+      - `container_rid` matches `operation.target()`'s container RID.
+      - `operation_kind` matches.
+      - `ResumeState` is `Request` (Phase 1 only knows that variant). Otherwise
+        `ErrorKind::DataConversion`.
+      - The token's stored `target` discriminator matches `operation.target()`'s
+        variant. For a `PartitionKey` token, the stored PK value must equal
+        `operation.target()`'s PK value (otherwise reject with
+        `ErrorKind::DataConversion`). For a `FeedRange` token the stored EPK bounds
+        must equal the operation's `FeedRange`.
+    - Seed `PlanNode::Request.continuation` with `RequestState.server_token`.
+
+17. **`PlanExecutor::execute(plan, driver_context, diagnostics)`.** §6.
+    - Match `OperationPlan::SingleNode(node)` and dispatch to a private
+      `execute_request_node`.
+    - `execute_request_node` builds an `OperationOverrides` from the node's
+      `feed_range` + `continuation` + `options.max_item_count` and calls
+      `execute_single_operation`. Returns the `CosmosResponse` straight through.
+
+18. **`CosmosDriver::execute_operation` rewrite.** §10.1.
+    - Sequence: plan → execute one page → return the `CosmosResponse`.
+    - Continuation token already lives on the response from step 13.
+    - Existing point operations now go through this path. Verify with the existing
+      point-op integration test suite — must pass with no test changes.
+
+19. **Send-future invariant.** The existing
+    `_assert_execute_operation_future_is_send` compile-time assertion must continue to
+    hold after the rewrite. Do not introduce non-`Send` types into the plan/executor.
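+
+Putting steps 15–16 together, the Phase-1 planner reduces to one guard plus one
+constructor (a sketch; the `operation_type()` / `target()` accessors are assumptions,
+and token validation is elided):
+
+```rust
+fn plan(
+    operation: Arc<CosmosOperation>,
+    options: OperationOptions,
+) -> azure_core::Result<OperationPlan> {
+    use azure_core::error::ErrorKind;
+
+    // Cross-partition queries are deferred to Phase 2 (step 16).
+    let cross_partition_query = operation.operation_type() == OperationType::Query
+        && matches!(operation.target(), OperationTarget::FeedRange(_));
+    if cross_partition_query {
+        return Err(azure_core::Error::with_message(
+            ErrorKind::Other,
+            "cross-partition queries are not yet supported (planned for Phase 2)",
+        ));
+    }
+
+    // Point ops, batch, control-plane ops, and PK-targeted queries: one node.
+    Ok(OperationPlan::SingleNode(PlanNode::Request {
+        operation,
+        options,
+        feed_range: None,   // Phase 1 never sets EPK bounds
+        continuation: None, // seeded from a validated token when present
+    }))
+}
+```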
+
+#### 13.1.5 SDK surface (`azure_data_cosmos`)
+
+§10.6.
+
+20. **Re-export `OperationTarget`.** Add `pub use azure_data_cosmos_driver::models::OperationTarget;`
+    next to the existing `FeedRange` re-export.
+
+21. **`QueryOptions` change.** Add `pub continuation: Option<String>` and
+    `pub max_item_count: Option<u32>`. Add chained setters
+    `with_continuation(...)`, `with_max_item_count(...)`. Keep `session_token` and
+    `operation` fields unchanged.
+
+22. **`query_items` signature change.** Replace
+    `partition_key: impl Into<PartitionKey>` with `target: OperationTarget`. Update doc
+    comments and example snippets to use `OperationTarget::partition(...)`.
+    - In Phase 1, calling with `OperationTarget::feed_range(_)` /
+      `OperationTarget::all_ranges()` returns the planner error from step 16. The SDK
+      does not need its own validation — it surfaces the driver error. Add a doc note
+      that cross-partition support arrives in Phase 2.
+
+23. **Pager-style return value.** The existing `query_items` returns
+    `FeedItemIterator` built by `QueryExecutor::into_stream()`. Replace its
+    implementation so each underlying page is produced by
+    `driver.execute_operation(op, opts)`:
+    - Build the `CosmosOperation::query(...)` once with the user's `target`.
+    - Loop: feed `options.continuation` into `OperationOptions`, await
+      `execute_operation`, emit a `FeedPage` from `ResponseBody::Items`, set the
+      next-iteration continuation from the response, stop when no continuation is
+      returned.
+    - `FeedPage::continuation_token()` exposes the `ContinuationToken` so callers can
+      pause / resume across process boundaries.
+    - Item deserialization happens here in the SDK (`T: DeserializeOwned`), not in the
+      driver.
+
+24. **Update other call sites.** Anything in the SDK that builds a
+    `CosmosOperation::query_items(...)` factory call must be updated for the new field
+    layout. Should be confined to `container_client.rs`.
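+
+The loop from step 23, sketched without the stream plumbing (`body_items` and
+`FeedPage::from_items` are hypothetical helpers, and collecting into a `Vec` stands in
+for yielding pages to the caller):
+
+```rust
+let operation = CosmosOperation::query(container, query, parameters).with_target(target);
+let mut continuation: Option<String> = options.continuation.clone();
+let mut pages: Vec<FeedPage<T>> = Vec::new();
+loop {
+    let mut opts = base_options.clone();
+    opts.continuation = continuation.take();
+    let response = driver.execute_operation(&operation, &opts).await?;
+    let token = response.continuation_token().cloned();
+    // Deserialize each raw item body into T here, in the SDK, not in the driver.
+    pages.push(FeedPage::from_items::<T>(response.body_items()?, token.clone())?);
+    match token {
+        Some(t) => continuation = Some(t.to_string()),
+        None => break, // no server continuation: the feed is drained
+    }
+}
+```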
+
+#### 13.1.6 Diagnostics (deferred)
+
+Hierarchical `NodeDiagnostics` (§8) are **deferred to Phase 2 or later**. Phase 1 keeps
+the existing flat `RequestDiagnostics` list. Document this in the PR description.
+
+#### 13.1.7 Tests
+
+> **Note:** No round-trip tests. Each serialization test pins an exact expected
+> string; each deserialization test feeds an exact input. Tests MAY base64-decode the
+> wire form locally so the JSON exemplar in the test is human-readable.
+
+25. **Driver unit tests.**
+    - `OperationTarget::partition(...)` populates both the logical PK and the
+      singleton `FeedRange` produced by `FeedRange::for_partition_key(...)`.
+    - `OperationTarget::feed_range(...)` and `all_ranges()` produce the expected
+      variants.
+    - `FeedRange::for_partition_key`: for a known PK value + definition, assert
+      `min_inclusive == max_exclusive == <expected EPK>` (use a hand-computed
+      EPK fixture).
+    - `FeedRange::is_singleton`: `true` for any `FeedRange::for_partition_key(...)`
+      output; `false` for `FeedRange::full()` and a multi-EPK fixture.
+    - `OperationPayload::Query` envelope serializes to the exact string
+      `{"query":"SELECT * FROM c","parameters":[]}` (and a parametrized variant with
+      the exact expected JSON, asserting field order / casing).
+    - `ContinuationToken` serialization: build a token with known fields, assert
+      `token.to_string()` equals an exact base64url string. Provide the JSON
+      exemplar inside the test body and base64url-encode it locally to derive the
+      expected string.
+    - `ContinuationToken` deserialization: feed a known base64url input, assert the
+      parsed `ContinuationTokenInner` field-by-field.
+    - `ContinuationToken` for a `PartitionKey`-target token preserves the original
+      logical PK value across decode (assert the PK value equals the input).
+    - `ContinuationToken` parse rejects: version > current; bad base64; bad JSON;
+      missing required fields. One assertion per failure mode.
+    - `Planner` returns `SingleNode` for every point op type (table-driven test).
+    - `Planner` returns `SingleNode` for `Query` + `PartitionKey`.
+    - `Planner` returns the Phase-2 error for `Query` + `FeedRange` /
+      `Query` + `all_ranges()`, asserting the exact error message.
+    - `Planner` rejects a continuation token whose stored PK value differs from the
+      operation's target PK (`ErrorKind::DataConversion`).
+    - `OperationOverrides` → request headers: lock-in test that asserts
+      `x-ms-continuation` and `x-ms-max-item-count` appear when the corresponding
+      override is set; assert that EPK headers DO NOT appear when the operation's
+      target is `OperationTarget::PartitionKey`; assert that EPK headers DO appear
+      when the target is `OperationTarget::None` or `OperationTarget::FeedRange(_)`.
+    - `ResponseBody::Items` parser: feed a fixture body, assert items extracted as
+      raw bytes without re-encoding (assert the `Vec<Vec<u8>>` byte-for-byte against
+      expected slices).
+
+26. **Driver integration tests (Cosmos DB Emulator).**
+    - All existing point-op tests pass unchanged.
+    - New: single-partition query against an emulator container
+      (`SELECT * FROM c`, `SELECT * FROM c WHERE c.id = @id`), pagination across
+      multiple pages by setting a small `max_item_count`. Tests provision their
+      own container, seed deterministic data, and assert exact item sets.
+      **Do not introduce any test-proxy recordings** — the existing point-op
+      recordings stay as-is, but new feed-operation tests run live against the
+      emulator only.
+
+27. **SDK integration tests (Cosmos DB Emulator).**
+    - `query_items` with `OperationTarget::partition(pk)` returns the expected
+      items.
+    - `query_items` with a `WHERE` clause filters server-side.
+    - Pagination: drain a multi-page result, then resume from a captured
+      continuation token mid-stream and verify the second half matches.
+    - Continuation token from a `partition`-target query, when handed back to a
+      fresh `query_items` call with the same logical PK, resumes correctly. When
+      handed to a different PK, it is rejected.
+
+---
+
+### 13.2 Phase 2 — Sequential Drain & Multi-Node Plans
+
+**End-state for Phase 2.** A user can write:
+
+```rust
+let pager = container.query_items::<serde_json::Value>(
+    "SELECT * FROM c WHERE c.year = @y",
+    [Parameter::new("@y", 2026)],
+    OperationTarget::all_ranges(), // or feed_range(fr)
+    None,
+)?;
+```
+
+…and the driver plans a `SequentialDrain` over every PK range that intersects the
+target, draining them one at a time, paginating across calls, surviving partition
+splits and merges via EPK headers, and producing a continuation token after each page.
+
+The Phase-2 PR is purely additive on top of Phase 1: the Phase-1 SingleNode fast-path
+remains the path for `OperationTarget::PartitionKey` queries.
+
+#### 13.2.1 Plan model expansion
+
+§4.
+
+1. **Add `NodeId` and `NodeRange`.** `pub(crate)`, `Copy`, with the `len`, `is_empty`,
+   `iter` helpers.
+
+2. **Extend `OperationPlan` with `Graph { nodes: Vec<PlanNode>, root: NodeId }`.**
+   Remove the Phase-1 `// TODO(phase-2)` comment.
+
+3. **Add `PlanNode::SequentialDrain { children: NodeRange }`.** Document the bottom-up
+   invariant inline.
+
+4. **Update existing pattern matches.** Anywhere that matched on `OperationPlan` /
+   `PlanNode` now needs to handle the new variants. The compiler enforces this.
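+
+A sketch of those additions (steps 1–3; exact derives and field visibility are a
+judgment call for the implementer):
+
+```rust
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
+pub(crate) struct NodeId(pub(crate) usize);
+
+/// Half-open range of node indices: `[start, end)`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct NodeRange { start: usize, end: usize }
+
+impl NodeRange {
+    pub(crate) fn len(&self) -> usize { self.end - self.start }
+    pub(crate) fn is_empty(&self) -> bool { self.start == self.end }
+    pub(crate) fn iter(&self) -> impl Iterator<Item = NodeId> {
+        (self.start..self.end).map(NodeId)
+    }
+}
+
+pub(crate) enum OperationPlan {
+    SingleNode(PlanNode),
+    // Bottom-up invariant: children always precede their parent in `nodes`,
+    // so `root` is always the highest NodeId in the graph.
+    Graph { nodes: Vec<PlanNode>, root: NodeId },
+}
+```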
+
+#### 13.2.2 Backend Query Plan request
+
+§5.2.1 / §12.2.
+
+5. **Add `BackendQueryPlan` types.** New module
+   `driver/query_plan/{mod.rs,backend_plan.rs}`. **There is no `QueryPlanClient`** —
+   the planner issues query-plan requests directly through `execute_single_operation`
+   (see step 6).
+   - Mirror the schema noted in the team's existing memory (camelCase JSON):
+     `partitionedQueryExecutionInfoVersion`, `queryInfo`, `queryRanges`,
+     `hybridSearchQueryInfo`. For Phase 2 we only need: `queryInfo.rewrittenQuery` (must
+     be empty / absent for in-scope queries), `queryInfo.hasNonStreamingOrderBy` (must be
+     `false`), `queryInfo.aggregates` (must be empty), `queryInfo.groupByExpressions`
+     (must be empty), `queryInfo.distinctType` (must be `None`), `queryInfo.orderBy`
+     (must be empty), `queryInfo.dCountInfo` (must be absent), `queryInfo.top` (must be
+     absent), `queryInfo.offset` / `limit` (must be absent), and `queryRanges`.
+
+6. **Issue the query-plan request inline from the planner.** Add a helper
+   `Planner::fetch_backend_plan(operation: &CosmosOperation) -> Result<BackendQueryPlan>`
+   that:
+   - Builds a synthetic `CosmosOperation` whose target is `OperationTarget::None`
+     and whose payload is the same `OperationPayload::Query` as the user's request.
+   - Calls `execute_single_operation` directly with an `OperationOverrides` that
+     sets the query-plan headers:
+     `x-ms-cosmos-is-query-plan-request: True`,
+     `x-ms-cosmos-supported-query-features: None`,
+     `x-ms-cosmos-query-version: 1.0`,
+     `Content-Type: application/query+json`,
+     `x-ms-documentdb-query-iscontinuationexpected: False`.
+     (Extend `OperationOverrides` with a small `extra_headers: Vec<(HeaderName, HeaderValue)>`
+     field if no cleaner mechanism exists, or — preferred — add a private
+     `RequestKind::QueryPlan` discriminant on the synthetic operation so the
+     transport layer applies the right headers without growing the public
+     override type. Pick the smaller diff.)
+   - Bypasses planning recursion (the planner calls `execute_single_operation`
+     directly, not `execute_operation`).
+   - Parses the response body into `BackendQueryPlan`.
+
+7. **Classify the plan as passthrough.**
+   - Helper `BackendQueryPlan::is_passthrough(&self) -> bool` returning `true` iff every
+     "must be empty/false/absent" check above passes.
+   - If `false`, the planner returns
+     `Err(azure_core::Error::with_message(ErrorKind::Other, "this query requires features that are not yet supported by the Rust SDK (cross-partition ORDER BY / GROUP BY / aggregates / vector / hybrid)"))`.
+
+#### 13.2.3 Planner: cross-partition `SELECT *`
+
+§5.4.
+
+8. **Wire `PartitionKeyRangeCache` into the planner.** Pass an `Arc<PartitionKeyRangeCache>`
+   on `Planner::new(...)`. Add a `fetch_pk_ranges_for_target` helper that returns the
+   list of `PartitionKeyRange`s overlapping the target's `FeedRange`.
+
+9. **Implement cross-partition planning.**
+   - Phase-1 SingleNode acceptance rules unchanged.
+   - For `Query` + (`FeedRange(_)` | `all_ranges()`):
+     1. Fetch the backend query plan (step 6). Cache by `(container_rid, query_text,
+        params_hash)` for the lifetime of one `execute_operation` call. (Cross-call
+        caching is §12.5 future work.)
+     2. Verify passthrough (step 7).
+     3. Compute the effective EPK bounds: intersect `target.feed_range()` with each PK
+        range from the cache. Filter out PK ranges that do not overlap.
+     4. Apply continuation-token resume: if the input continuation has
+        `ResumeState::SequentialDrain { epk_min, epk_max, server_token }`, drop ranges
+        whose `max_epk <= epk_min`; the first remaining range carries `server_token`,
+        the rest start fresh. §7.3.
+     5. Build nodes bottom-up: push N `PlanNode::Request` (each carries
+        `Arc<CosmosOperation>` shared across siblings, its EPK range, and any seeded
+        continuation), then push `PlanNode::SequentialDrain { children: NodeRange(0..N) }`.
+        
+ `root = NodeId(N)`. + +10. **Single-Request degenerate case.** If after intersection N == 1, still emit a + `Graph` plan (do **not** silently downgrade to SingleNode) so the executor can + produce a `ResumeState::SequentialDrain` token consistent across pages. + +#### 13.2.4 Executor: SequentialDrain walk + +§6.1. + +11. **Match `OperationPlan::Graph`** in `PlanExecutor::execute`, look up `root`, dispatch. + +12. **`execute_sequential_drain`.** + - Pick the active child = `children.start` (Phase-2 invariant: planner has already + pruned drained ranges). + - Acquire a single concurrency permit (sequential: a `Semaphore::new(1)` per drain; + the permit machinery is §5.6 future work but we add a minimal stub now so that + future variants slot in cleanly — a `tokio::sync::Semaphore` is fine). + - Execute the active child via `execute_request_node`. + - Build the output `ContinuationToken` (step 14). + - Return the `CosmosResponse` with the new token attached. + +13. **Page-boundary rule.** Each `execute_operation` call returns exactly one page from + one partition. Even if the active child completes (no server continuation) and there + are more children, do NOT proactively start the next one in the same call — the + output continuation simply moves the cursor to the next range. §10.5. + +14. **Output continuation construction.** + - If the executed `Request` returned a server continuation: emit + `ResumeState::SequentialDrain { epk_min, epk_max, server_token: Some(...) }` for + the active range. + - If it did not, and there are more children to the right: emit + `ResumeState::SequentialDrain { epk_min, epk_max, server_token: None }` so the + Planner on the next call skips this range and starts the next. + - If it did not, and the active child was the last one: emit `None` continuation + (operation complete). + +#### 13.2.5 Continuation token expansion + +§7. + +15. **Add `ResumeState::SequentialDrain(SequentialDrainState)`.** Add explicit + serialization tests (assert against an exact base64url string) and explicit + deserialization tests (feed an exact input, assert parsed structure + field-by-field). No round-trip tests. + +16. **Version preservation across resume.** Already handled by Phase 1's "echo the input + version" rule — verify it continues to apply when the input is a `SequentialDrain` + token. + +17. **Reject cross-variant tokens.** A `SequentialDrain` token presented to a Phase-1 + SingleNode operation is rejected (`ErrorKind::DataConversion`). A `Request` token + presented to a cross-partition operation is also rejected. + +#### 13.2.6 Split / merge handling + +§9.1 / §9.2. + +18. **Inside `execute_request_node`, on `Status 410 SubStatus 1002`:** + - Invalidate `PartitionKeyRangeCache` for the container. + - Re-fetch the PK ranges intersecting the node's `feed_range`. + - For each new sub-range, issue a follow-up `execute_single_operation` call with + `OperationOverrides.feed_range` set to the **original** node EPK bounds (so EPK + headers narrow the result to the node's slice even on a merged PK range). + - Concatenate the resulting `ResponseBody::Items` into one page-output. + - The continuation token logic (step 14) still applies to the original EPK bounds, + not the new sub-range bounds; the next planner call will see the new topology. + +19. **Pipeline-level helper.** Make sure the transport layer always emits + `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max` whenever + `OperationOverrides.feed_range` is set. (Hooked up in Phase 1 step 11.) 
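+
+Steps 13–14 reduce to a three-way branch per `execute_operation` call; a sketch as a
+free-standing helper (field types assumed, not the final executor shape):
+
+```rust
+fn drain_output_token(
+    epk_min: String,
+    epk_max: String,
+    server_token: Option<String>,
+    more_children_to_the_right: bool,
+) -> Option<ResumeState> {
+    match (server_token, more_children_to_the_right) {
+        // More pages remain in the active range: stay on it.
+        (Some(token), _) => Some(ResumeState::SequentialDrain(SequentialDrainState {
+            epk_min, epk_max, server_token: Some(token),
+        })),
+        // Active range drained but ranges remain: next call moves the cursor right.
+        (None, true) => Some(ResumeState::SequentialDrain(SequentialDrainState {
+            epk_min, epk_max, server_token: None,
+        })),
+        // Last range drained: the operation is complete, no continuation.
+        (None, false) => None,
+    }
+}
+```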
+ +#### 13.2.7 SDK surface adjustments + +20. **Lift Phase-1 restriction.** The planner now accepts `FeedRange` and `all_ranges()` + targets, so the SDK's `query_items` automatically gains cross-partition support — no + code change required at the SDK boundary beyond updating doc comments and adding + examples. + +21. **Pager loop.** The Phase-1 pager loop in `query_items` works unchanged — it just + drains more pages now. + +22. **`FeedRange::for_partition_key` interaction.** Confirm that + `FeedRange::for_partition_key(pk, &definition)` (added in Phase 1, step 1) + returns a `FeedRange` that, when handed to `OperationTarget::feed_range(...)`, + drives a single-PK-range `SequentialDrain` and returns the same items as + `OperationTarget::partition(pk, &definition)`. Add an emulator integration + test that asserts the two paths produce the same item set. + +#### 13.2.8 Diagnostics (now or split into a follow-up PR) + +23. Decide based on PR size. If hierarchical `NodeDiagnostics` (§8) fits, add it here: + `start_node` / `complete_node` builder methods, recursive collection in the executor, + flat `requests()` accessor for back-compat. If not, file a follow-up issue and ship + Phase 2 with the existing flat diagnostics — this is purely an observability + enhancement and does not affect correctness. + +#### 13.2.9 Tests + +> Same testing rules as §13.1.7: no round-trip serialization tests; integration tests +> run against the Cosmos DB Emulator, not test-proxy recordings. + +24. **Planner unit tests.** + - Cross-partition `SELECT *` against 3 PK ranges produces a Graph with 3 Request + children + 1 SequentialDrain root, bottom-up. + - Resume from `ResumeState::SequentialDrain` skips left-of-cursor ranges, seeds the + active range's continuation, leaves right-of-cursor ranges unseeded. + - Backend query plan rejection: a query with cross-partition `ORDER BY` is rejected + with the documented error message (assert exact string). + - Single-PK degenerate case still emits `Graph`, not `SingleNode`. + +25. **Executor unit tests (with mock pipeline).** + - SequentialDrain processes one page per call; continuation token threads correctly. + - Active-range completion (no server token, more children) emits a token with + `server_token: None` that on the next call skips to the next range. + - Last range exhausted → `continuation_token` is `None`. + +26. **Continuation token tests.** + - `SequentialDrain` serialization: build a token with known fields, assert + `to_string()` equals an exact base64url string (JSON exemplar provided in the + test, base64url-encoded locally). + - `SequentialDrain` deserialization: feed an exact base64url string, assert the + parsed structure field-by-field. + - Cross-variant rejection (Phase 1 `Request` token vs Phase 2 `SequentialDrain` + operation, both directions). + - Version preservation: input v1 token → all output tokens are v1, even if + `CURRENT_TOKEN_VERSION` has bumped (assert by constructing a v1 input token, + executing one page, asserting the output token's version field equals 1). + +27. **Split / merge tests.** + - Unit-level fault injection: inject a 410/1002 response in the mock pipeline; + verify the Request node re-resolves the PK ranges, issues follow-up calls + with EPK headers narrowing to the original node bounds, and the page completes + without surfacing the error to the caller. + - Verify the EPK headers on the post-split sub-requests by inspecting the + mock pipeline's recorded requests. 
- Emulator integration: there is no portable way to force a split on the
+     emulator, so split/merge coverage is unit-level only. Note this in the PR.
+
+28. **End-to-end SDK integration tests (Cosmos DB Emulator).**
+    - `OperationTarget::all_ranges()` with `SELECT * FROM c` against a multi-partition
+      container drains every item exactly once, in `(EPK, RID)` order.
+    - Same with a `WHERE` clause: only matching items returned.
+    - `OperationTarget::feed_range(FeedRange::for_partition_key(pk, &def)?)` returns
+      the same set as `OperationTarget::partition(pk, &def)` for the same logical
+      partition.
+    - Mid-stream pause/resume: capture the continuation after page 2, build a fresh
+      `query_items` call with the same query + the captured token, verify the
+      remaining pages match.
+
+---
+
+### 13.3 Out of Scope for Both Phases
+
+- Cross-partition `ORDER BY`, `GROUP BY`, aggregates, vector, hybrid (§12.2).
+- ReadMany (§12.1).
+- Change feed (§12.3).
+- Hierarchical OTEL spans built by the SDK from `NodeDiagnostics` (the SDK owns this,
+  not the driver; spec'd in §8.7 but not on the implementation critical path).
+- Cross-call `OperationPlan` caching (§12.5).
+- Hedging on feed nodes (§12.6).
+- Dedicated `read_all_items` factory (§12.7).

From 682f42856ae9ca7b82d4bc3bd74667296dc32095 Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Thu, 7 May 2026 11:24:47 -0700
Subject: [PATCH 12/29] Restructure to a simpler requirements document instead
 of a detailed spec/plan

---
 .../docs/FEED_OPERATIONS_REQS.md              |  180 ++
 .../docs/FEED_OPERATIONS_SPEC.md              | 2819 -----------------
 2 files changed, 180 insertions(+), 2819 deletions(-)
 create mode 100644 sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_REQS.md
 delete mode 100644 sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md

diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_REQS.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_REQS.md
new file mode 100644
index 00000000000..34f854b214c
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_REQS.md
@@ -0,0 +1,180 @@
+# Feed Operations — Requirements & Design Primer
+
+**Crate:** `azure_data_cosmos_driver`
+**Scope:** Driver-internal architecture for feed operations (queries, future read-many, change feed)
+**Current focus:** `SELECT * [WHERE <filter>]` using natural order
+
+---
+
+## 1. Context
+
+The driver currently handles only point operations (single request → single response). Feed operations produce multiple pages of results, may span many physical partitions, and require resumable pagination state that survives process boundaries.
+
+Feed operations must flow through the same execution infrastructure as point operations (region failover, session tokens, retry, diagnostics) without penalizing point-operation latency. We are doing multi-millisecond network I/O per page fetch, so the design optimizes for clarity and correctness over nanosecond-level micro-optimization.
+
+---
+
+## 2. Dataflow Pipeline
+
+All operations — point and feed — are expressed as a **Dataflow Pipeline**: a tree of nodes where leaf nodes perform I/O and intermediate nodes perform sequencing or aggregation.
+
+### Structure
+
+- The pipeline is a **tree**. Nodes own their children. Fan-out creates branching.
+- **Leaf nodes** issue a single Cosmos DB request via the existing operation pipeline (retry, failover, auth, transport).
+- **Intermediate nodes** orchestrate their children. 
The first intermediate node type is `SequentialDrain`, which iterates children in EPK order, fully draining one before advancing to the next. +- **Trivial pipelines** (point operations, single-partition feeds) are a single leaf node with no intermediate parent. These must add near-zero overhead compared to today's direct execution path. + +### Pipeline Lifecycle + +- `execute_operation` is called once per page. Each call advances the pipeline by one page of results from one physical partition. +- The pipeline object itself is the in-process iteration state. The consumer of the driver (SDK layer) is responsible for holding the pipeline across calls. +- For cross-process resumption, the pipeline state serializes to a `ContinuationToken` string. On resume, the token reconstitutes a pipeline at the correct position. +- It is cleanest to unify these: `ContinuationToken` holds the live pipeline object in-process, and produces the serialized string form on demand. + +### Future Node Types (Design For, Don't Implement Yet) + +- **UnorderedMerge**: concurrent fan-out, results returned in arrival order (Read Many). +- **StreamingOrderedMerge**: k-way merge of pre-sorted partition streams (streaming ORDER BY). +- **BufferedOrderedMerge**: collect all results, then sort (non-streaming ORDER BY). +- **HybridSearch**: issues multiple distinct sub-queries (e.g., vector similarity + full-text keyword) against different child pipelines, then combines/re-ranks their results. Demonstrates that an intermediate node may have heterogeneous children with different semantics. +- **Aggregate**: client-side aggregation across partitions. + +--- + +## 3. Key Invariants + +### Ordering + +When no `ORDER BY` is specified, the driver guarantees results in **(EPK, RID) ascending order**. Within each physical partition, the server returns items in ascending RID order. Across partitions, the driver iterates in ascending EPK order. This is a driver-level guarantee for `SELECT *` queries. + +### Page Boundaries & Suspension + +- For the initial `SequentialDrain` implementation, suspension occurs at page boundaries. A continuation token for this node type only needs to track which partition is active and the server's opaque page token for that partition. +- The continuation token design must allow future node types to store intra-page progress (e.g., a streaming ORDER BY merge may suspend mid-page when its output buffer is full but source partitions are partially consumed). +- A given server continuation token only guarantees you get the *next* page of results from that partition — even if the SDK presents a per-item iterator to the user. + +### Fan-Out Limit + +Cross-partition queries are expensive by design. Containers may have hundreds of thousands of physical partitions; unbounded fan-out is dangerous from a performance and scalability perspective. + +- **Max fan-out**: The pipeline refuses to plan an operation spanning more than N physical partitions. Default: **100**. Configurable by the caller for workloads that intentionally query broadly. +- **Max concurrency**: A separate limit on concurrent in-flight requests within a single pipeline execution. Not needed for the initial `SELECT * WHERE` implementation (sequential drain uses concurrency = 1) but the limit must exist as a configuration point for future concurrent node types. + +### Partition Targeting + +An operation targets the key space in one of three mutually exclusive ways: + +1. **No partition scope** — account/database-level operations. +2. 
**Logical partition key** — point operations and single-partition feeds. Routes via the gateway using the PK header. No EPK headers. No fan-out. +3. **Feed range (EPK range)** — cross-partition feeds. Resolved to physical partition(s) at plan time. The full container is just the special case of `[min_epk, max_epk)`. + +These are mutually exclusive at the type level — not a runtime check. + +--- + +## 4. Continuation Token & Resumption + +### Dual Nature + +The `ContinuationToken` type serves two roles: + +1. **In-process**: holds the live pipeline state. The SDK keeps it across `execute_operation` calls. No serialization needed per page. +2. **Cross-process**: serializes to an opaque string (base64url-encoded JSON). Safe to store in databases, send to browsers, carry across SDK upgrades. + +### Token Properties + +- **Durable across SDK versions.** Newer SDKs must deserialize tokens from older SDKs. Version field is the option of last resort. +- **O(1) size for sequential drain.** Only the active partition's EPK bounds and server continuation are stored. Drained partitions are reconstructed from the EPK cursor on resume. +- **Bound to the operation.** Tokens include a container RID and operation kind. Replaying a token against a different container or operation type is rejected. +- **Survives partition topology changes.** Tokens store EPK bounds, not physical partition IDs. Splits and merges are handled by re-resolving EPK bounds to current partitions. + +### What the Token Does NOT Encode + +- Query text or parameters (caller must supply an equivalent operation). +- Session tokens or consistency state. +- Per-partition state for all partitions (only the cursor position for sequential drain). + +--- + +## 5. Pipeline Repair (Splits) + +Physical partitions can split at any time. The pipeline must handle this transparently. + +### Leaf Node Invariant + +At all times, a leaf node targets **one specific physical partition** and **one EPK range** that is contained within that partition and does not overlap with any of its peer leaf nodes. A leaf node can only issue one request, so it is impossible for it to target multiple physical partitions. + +### Splits Break the Invariant + +When a physical partition splits, a leaf node's EPK range suddenly covers two or more new physical partitions. The pipeline detects this via a 410 (PartitionIsGone) response — either a full page is returned successfully or a 410 is returned; this never occurs mid-page. + +The leaf node is responsible for **splitting itself** to restore the invariant: + +1. Invalidate the cached partition map for the container. +2. Re-resolve the leaf's EPK range to the new physical partition(s). +3. The single leaf becomes multiple leaves in the parent's children list (the parent must obviously cooperate with this), each targeting one of the new physical partitions with a non-overlapping sub-range of the original EPK range. Depth of the tree remains the same. +4. Execution resumes against the correct new leaf. + +### Merges Do Not Require Repair + +After a merge, multiple leaf nodes may point to different EPK ranges on the same physical partition. This is acceptable — the leaf still targets a single partition and uses EPK min/max headers to scope its request to its intended slice. No pipeline restructuring is needed. (Consolidating redundant leaves after a merge is a potential future optimization but is out of scope to avoid complicating the design.) + +--- + +## 6. 
Current Implementation Focus
+
+The initial implementation targets `SELECT * [WHERE <filter>]` queries:
+
+- **Single-partition**: trivial pipeline (one leaf node). The server evaluates the full SQL including any WHERE clause. Paginated via server continuations.
+- **Cross-partition**: `SequentialDrain` intermediate node over N leaf nodes (one per physical partition). Drains partitions in EPK order. No query plan fetch required for passthrough SELECT/WHERE.
+
+### What This Exercises
+
+- Partition key range resolution and caching.
+- Sequential traversal across partitions in EPK order.
+- EPK range scoping via request headers.
+- Paginated reads within each partition.
+- Continuation token serialization, resume, and topology-change survival.
+- Integration with the existing operation pipeline for each sub-request.
+- Pipeline repair on partition splits/merges.
+
+---
+
+## 7. Design Boundaries
+
+### The Driver Does NOT:
+
+- Deserialize item bodies. It returns raw bytes per item; the SDK handles deserialization.
+- Create telemetry spans. It returns structured diagnostics data; the SDK creates OpenTelemetry spans.
+- Own the iteration lifetime for multi-page feeds. It executes one page per call; the SDK loops.
+- Fetch or interpret backend query plans (for the current SELECT/WHERE scope).
+
+### Item Body Opacity
+
+For `SequentialDrain`, item bodies are fully opaque binary payloads. The pipeline does not inspect them — ordering is already established by the backend.
+
+Future node types (e.g., streaming ORDER BY, hybrid search) may require partial parsing of item bodies. The backend query plan can rewrite the query to use a standardized envelope (promoting ordering keys to top-level fields and demoting the raw user document to a `payload` field). This varied-shape pattern must be considered in the overall design direction, but does not need to be accommodated in the current implementation.
+
+### The Driver DOES:
+
+- Plan the pipeline (determine targeting, resolve partitions, build the node tree).
+- Execute one page per call through the existing retry/failover infrastructure.
+- Produce and consume continuation tokens.
+- Repair the pipeline on topology changes (splits/merges).
+- Enforce fan-out limits.
+- Collect per-node diagnostics for the SDK to surface.
+
+---
+
+## 8. Future Considerations (Inform Design, Don't Implement)
+
+These capabilities must be achievable without redesigning the pipeline model:
+
+- **Streaming ORDER BY**: k-way merge of partition streams. Requires fetching a backend query plan to determine sort keys. New intermediate node type.
+- **Buffered ORDER BY**: collect all partition results, sort client-side. Same query plan requirement. Different intermediate node.
+- **Vector / Hybrid Search**: may require preliminary requests to fetch full-text statistics before issuing the main query. Multi-phase pipeline execution.
+- **Read Many Items**: fan-out by (ID, PK) pairs grouped by partition. Concurrent leaf execution with an unordered merge intermediate node.
+- **Change Feed**: per-range continuation tokens (O(N) token size, unlike sequential drain's O(1)). Different resumption semantics.
+
+The pipeline's tree structure, typed node hierarchy, and separation of planning from execution accommodate all of these as new node types and planning strategies without changing the core execution loop.
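+
+As a closing illustration, a minimal sketch of the node tree implied by §2 and §5
+(names and field types are illustrative, not a committed API):
+
+```rust
+/// Sketch of the dataflow-pipeline tree. A leaf targets exactly one physical
+/// partition and one EPK slice; an intermediate node owns its children.
+enum PipelineNode {
+    Leaf {
+        partition_id: String,        // the one physical partition this leaf targets
+        epk_range: (String, String), // [min_epk, max_epk), non-overlapping with peers
+        server_token: Option<String>,
+    },
+    SequentialDrain {
+        children: Vec<PipelineNode>, // in ascending EPK order
+        active: usize,               // cursor; children left of it are fully drained
+    },
+}
+```
+
+On a 410 split signal, repair replaces the affected `Leaf` with several new leaves in
+the parent's `children` list; the tree's depth never changes.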
diff --git a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md b/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md
deleted file mode 100644
index 044e3b035b9..00000000000
--- a/sdk/cosmos/azure_data_cosmos_driver/docs/FEED_OPERATIONS_SPEC.md
+++ /dev/null
@@ -1,2819 +0,0 @@
-# Feed Operations Spec for `azure_data_cosmos_driver`
-
-**Status:** Draft / Iterating
-**Date:** 2026-05-05
-**Authors:** (team)
-**Crate:** `azure_data_cosmos_driver`
-
----
-
-## Table of Contents
-
-1. [Goals & Motivation](#1-goals--motivation)
-2. [Architectural Overview](#2-architectural-overview)
-3. [CosmosOperation Changes](#3-cosmosoperation-changes)
-4. [Operation Plans](#4-operation-plans)
-5. [Planner](#5-planner)
-6. [Plan Executor](#6-plan-executor)
-7. [Continuation Tokens](#7-continuation-tokens)
-8. [Diagnostics Structure](#8-diagnostics-structure)
-9. [Error Handling, Splits & Merges](#9-error-handling--partition-splits--merges)
-10. [API Semantics & Invariants](#10-api-semantics--invariants)
-11. [Testing Strategy](#11-testing-strategy)
-12. [Future Work](#12-future-work)
-13. [Implementation Plan](#13-implementation-plan)
-
----
-
-## 1. Goals & Motivation
-
-### Problem Statement
-
-The driver currently supports only **point operations** — operations that target a single resource
-and produce a single response. Operations like `ReadItem`, `UpsertItem`, and `DeleteContainer` go
-through `execute_operation`, which drives the operation pipeline (region failover, session tokens,
-transport retry) and returns a single `CosmosResponse`.
-
-**Feed operations** — queries, read-many, and change feed — are fundamentally
-different. They produce multiple pages of results, may span multiple partition key ranges, and
-need pagination state that can be serialized across request boundaries.
-
-Today, feed operations are handled entirely in the higher-level `azure_data_cosmos` crate, bypassing
-the driver's operation pipeline. This means feed operations miss out on the driver's multi-region
-failover, partition-level circuit breaker, throughput control, and diagnostics infrastructure.
-
-### Goals
-
-1. **Unified execution model** — Both point and feed operations flow through a common
-   Plan → Execute pipeline. Point operations produce a trivial single-node plan. Feed operations
-   produce multi-node plans that leverage the existing point-operation pipeline for individual
-   Cosmos requests.
-
-2. **Resumable pagination** — Feed operations produce a typed continuation token that can be
-   serialized to a string and carried across process boundaries (e.g., sent to a browser).
-   Resuming with a valid continuation token and an equivalent operation descriptor continues
-   where the previous execution left off.
-
-3. **Extensible operation model** — The plan model must support `SELECT * [WHERE …]` queries
-   (the initial target), cross-partition queries with `ORDER BY` / aggregates, single-partition
-   queries/reads, read-many, and change feed, even if some are implemented later.
-
-4. **Driver-level concerns** — Feed operations must integrate with multi-region failover,
-   partition-level failover (PPAF/PPCB), throughput control, session consistency, and
-   diagnostics — all managed by the driver.
-
-5. **Schema-agnostic pages** — The driver returns feed pages as a list of pre-parsed item
-   bodies (`Vec<Vec<u8>>`), each entry being the raw serialized bytes of one item. Point
-   operations continue to return a single body (`Vec<u8>`). 
The driver does not deserialize - item bodies; the higher-level SDK handles deserialization. To support both shapes through - a single `CosmosResponse` type, this spec introduces a `ResponseBody` enum (analogous to - `OperationPayload` for requests) — see [§10.2 CosmosResponse Changes](#102-cosmosresponse-changes). - -6. **Performance non-regression** — Point operations must not pay measurable overhead for the - unified plan model. Trivial plans must be allocation-light. No heap allocation for trivial - plans beyond what `execute_operation` does today. No additional async machinery (no spawning, - no channels) for single-node plans. - -### Non-Goals (This Spec) - -- Cross-partition query execution with cross-partition `ORDER BY` merge-sort, `GROUP BY`, or - cross-partition aggregation (future work). Single-partition queries with `ORDER BY` / `GROUP BY` - / aggregates *are* in scope and pass through verbatim (see [§5.2](#52-planning-logic-by-operation-type)). -- Backend query plan retrieval and interpretation (future work; required for cross-partition - queries with `ORDER BY` / aggregates and for vector / hybrid queries, but not for the - in-scope cases). -- Change feed full design (future work; this spec reserves extension points). -- ReadMany fan-out with concurrent partition fetching (future work). -- Client-side query rewriting or optimization. -- Concurrent partition fetching or merge steps. - -### Primary Target - -**`SELECT * [WHERE ]` queries** are the first feed operations to implement. The -unfiltered form (`SELECT * FROM c`) is the simplest case; with an optional `WHERE` clause -the same code path supports server-side filtering. Both forms drain -partitions sequentially in effective partition key (EPK) order. Items are returned in their -**natural order**: ascending by `(EffectivePartitionKey, RID)`. Within each partition the server -returns items in ascending RID order; across partitions the driver iterates partitions in -ascending EPK order. - -This first target deliberately excludes any query feature that requires a backend query plan -to execute correctly across partitions — `ORDER BY` (cross-partition), `GROUP BY`, `DISTINCT`, -aggregates (`COUNT`, `SUM`, …), `OFFSET / LIMIT`, vector search, hybrid search, etc. Those are -covered separately under cross-partition queries in [§12.2](#122-cross-partition-queries). - -The in-scope shape exercises: - -- Partition key range resolution (via `PartitionKeyRangeCache`) -- Sequential traversal across partition key ranges in EPK order -- EPK range filtering via `x-ms-documentdb-epk-min` and `x-ms-documentdb-epk-max` headers -- Paginated reads within each partition -- Continuation token serialization and resume across SDK versions -- Integration with the operation pipeline for each sub-request -- Per-fetch header overrides (EPK bounds, server continuation, page size) applied without - rebuilding the base `CosmosOperation` — see [§6.3 OperationOverrides](#63-operationoverrides) - -This spec is **complete when `SELECT * [WHERE …]` works end-to-end** through the Plan → Execute -pipeline, both as a cross-partition operation (`OperationTarget::FeedRange`) and as a -single-partition operation (`OperationTarget::PartitionKey`). Sections on continuation tokens -and the plan model are designed to be extensible for future operations (ReadMany, -cross-partition query, change feed) without requiring a redesign. - -**Ordering semantics:** Cross-partition `SELECT *` drains partitions in EPK order as an -implementation behavior. 
Within each partition, items are returned in ascending RID order — -the natural sort order of `SELECT *`. The combined output is therefore ascending by -`(EffectivePartitionKey, RID)`. This is a driver-emitted ordering, **not** a service-level -ordering guarantee. The service does not guarantee global cross-partition order without -explicit `ORDER BY`. Single-partition queries (targeted via -`OperationTarget::PartitionKey`) preserve whatever order the server returns, including -`ORDER BY` results. - ---- - -## 2. Architectural Overview - -`CosmosDriver::execute_operation` is the single entry point for **all** operations — both -point and feed. The driver is stateless across calls: each invocation produces a fresh -`OperationPlan` (consulting the input continuation token if present), executes one page of -that plan, and returns a `CosmosResponse` with an optional continuation token. Point -operations always return without a continuation; feed operations return one when more pages -remain. The SDK layer decides which operations to expose to its callers as pagers. - -```mermaid -flowchart TB - Caller["SDK / caller
execute_operation(op, opts)"] - - subgraph Driver["CosmosDriver"] - direction TB - - Planner["Planner
──────────
Input: CosmosOperation + OperationOptions
Output: OperationPlan

• Determines targeting (point PK, FeedRange, full key space)
• Cross-partition SELECT *: resolves PK ranges → SequentialDrain over Request nodes
• Single-partition ops: single-node plan
• Point ops: trivial single-node plan"] - - Executor["PlanExecutor
──────────
Input: OperationPlan
Output: CosmosResponse (single page)

• Executes one Request node per call
• Handles partition splits / merges (Request re-resolves EPK → PK)
• Collects node-level diagnostics
• Builds continuation token if more pages remain"] - - Pipeline["Operation Pipeline (existing)
──────────
execute_single_operation()

• Region failover
• Session tokens
• Transport retry, auth, 429 backoff
• Per-request diagnostics"] - - Planner --> Executor - Executor --> Pipeline - end - - Caller -->|"CosmosOperation,
OperationOptions"| Planner - Pipeline -.->|"per-request
response"| Executor - Executor -->|"CosmosResponse
(+ continuation?)"| Caller
-
-    classDef component fill:#f5f5f5,stroke:#333,stroke-width:1px,text-align:left
-    classDef caller fill:#e8f0ff,stroke:#333,stroke-width:1px
-    class Planner,Executor,Pipeline component
-    class Caller caller
-```
-
-Internally, every call follows the same three-step flow:
-
-1. **Plan** — the Planner converts the `CosmosOperation` (plus any input continuation
-   token) into an `OperationPlan`.
-2. **Execute one page** — the PlanExecutor walks the plan and issues exactly one Cosmos
-   request via `execute_single_operation`.
-3. **Respond** — the executor returns a `CosmosResponse`, attaching a continuation token
-   when more pages remain.
-
-### Layer Separation
-
-The existing `execute_operation_pipeline` function is renamed to **`execute_single_operation`**
-in this spec. It remains the internal entry point for executing a single Cosmos DB operation
-through the operation pipeline (region failover, session tokens, transport retry, auth, 429
-backoff, diagnostics). The feed operations layer calls `execute_single_operation` for each
-individual Cosmos request within a plan.
-
-| Concern | Component | Location |
-|---------|-----------|----------|
-| Operation intent & payload | `CosmosOperation` | `models/cosmos_operation.rs` |
-| Plan creation | `Planner` | `driver/plan/planner.rs` (new) |
-| Plan model | `OperationPlan`, `PlanNode` | `driver/plan/plan.rs` (new) |
-| Plan execution | `PlanExecutor` | `driver/plan/executor.rs` (new) |
-| Continuation state | `ContinuationToken` | `models/continuation_token.rs` (new) |
-| Per-node request execution | `execute_single_operation` | `driver/pipeline/` (existing) |
-
-### Open Issue: Re-Planning on Every Page
-
-Because `execute_operation` is stateless, the driver must re-plan the operation on every
-call — including subsequent pages of a paginated feed. The Planner uses the continuation
-token to reconstruct the plan state, but still performs the full planning step (PK range
-resolution) on each page.
-
-For in-process callers (the common case), this is wasteful: the SDK crate calls
-`execute_operation` in a loop, and the plan structure doesn't change between pages (Request
-nodes handle partition splits internally by re-resolving EPK ranges). A future optimization
-could allow `CosmosResponse` and/or `CosmosOperation` to carry a **cached `OperationPlan`**
-so that subsequent requests skip re-planning when the plan is still valid. The cached plan
-would be invalidated on account metadata changes, falling back to a full re-plan.
-
-This optimization is not required for correctness — the stateless model works correctly
-today — but should be considered for performance-sensitive workloads with many small pages.
-
----
-
-## 3. CosmosOperation Changes
-
-### 3.1 OperationType Refactor
-
-`OperationType` currently carries no data and is `Copy`. Feed operations require variant-specific
-data (query text, item lists, etc.). Rather than bloating `OperationType` with payload data — which
-would break `Copy` and mix operation semantics with operation payload — we split the concern:
-
-- **`OperationType`** remains a lightweight, `Copy` enum describing operation semantics
-  (HTTP method, read-only, idempotent). Unchanged from today.
-
-- **`OperationPayload`** is a new enum carrying variant-specific data. It replaces the untyped
-  `body: Option<Vec<u8>>` field on `CosmosOperation`.
-
-```rust
-/// Operation-specific payload data.
-///
-/// Replaces the generic `body: Option<Vec<u8>>` on `CosmosOperation`.
-/// Each variant carries exactly the data needed for its operation type.
-#[derive(Clone, Debug)]
-pub enum OperationPayload {
-    /// No payload needed (e.g., ReadItem, DeleteItem, ReadContainer).
-    None,
-
-    /// Raw body bytes (e.g., CreateItem, UpsertItem, ReplaceItem).
-    /// The caller provides pre-serialized JSON.
-    Body(Vec<u8>),
-
-    /// A SQL query against documents (`SELECT * [WHERE …]`, etc.).
-    /// The driver wraps this in the `application/query+json` envelope on
-    /// the wire; the caller does not pre-serialize it.
-    Query {
-        query: String,
-        parameters: Vec<QueryParameter>,
-    },
-
-    // Future variants:
-    // ReadMany { items: Vec<(String, PartitionKey)> },
-    // ChangeFeed { mode, start_from, ... },
-}
-```
-
-`CosmosOperation` changes from:
-
-```rust
-pub struct CosmosOperation {
-    operation_type: OperationType,
-    resource_type: ResourceType,
-    resource_reference: CosmosResourceReference,
-    partition_key: Option<PartitionKey>,
-    request_headers: CosmosRequestHeaders,
-    body: Option<Vec<u8>>, // ← removed
-}
-```
-
-to:
-
-```rust
-pub struct CosmosOperation {
-    operation_type: OperationType,
-    resource_type: ResourceType,
-    resource_reference: CosmosResourceReference,
-    target: OperationTarget,
-    request_headers: CosmosRequestHeaders,
-    payload: OperationPayload,
-}
-```
-
-### 3.2 OperationTarget
-
-Partition targeting is currently a single `Option<PartitionKey>` field. Feed operations require
-richer targeting. The targeting enum has three **mutually exclusive** modes: no partition scope,
-a specific logical partition key, or an EPK range. An operation chooses exactly one of these —
-it never combines a logical partition key with a feed range.
-
-| Variant | When the SDK picks it | What the driver does |
-|---------|----------------------|----------------------|
-| `None` | Account-/database-level operations (`CreateDatabase`, `ReadContainer`). | No PK routing; no EPK headers. |
-| `PartitionKey(pk)` | Point operations and any single-partition feed operation (queries, change feed scoped to one logical partition). | Sends the raw PK header (`x-ms-documentdb-partitionkey`); routes to that PK's owning physical partition. **Bypasses query plan** for SQL queries — see [§5.2](#52-planning-logic-by-operation-type). |
-| `FeedRange(fr)` | Cross-partition feed operations, including the default "whole container" case via `OperationTarget::all_ranges()`. | Resolves the FeedRange to one or more PK range IDs via `PartitionKeyRangeCache`; sets `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max` headers per fetch. |
-
-Logical partition key targeting and feed range targeting are mutually exclusive at the type
-level. A caller that wants to target a single logical partition uses
-`OperationTarget::PartitionKey`. A caller that wants to target a slice of EPK space (one
-physical partition, several adjacent ones, or the whole container) uses
-`OperationTarget::FeedRange`. The SDK surface enforces the same exclusivity (see
-[§10.6 SDK Option Plumbing](#106-sdk-option-plumbing)).
-
-```rust
-/// How the operation is targeted to partitions. Variants are mutually exclusive.
-#[derive(Clone, Debug)]
-pub enum OperationTarget {
-    /// No partition targeting (account-level or database-level operations,
-    /// such as CreateDatabase or ReadContainer).
-    None,
-
-    /// Target a specific logical partition key.
-    ///
-    /// Used for point operations (read, create, delete, upsert, replace) and
-    /// for single-partition feed operations (queries scoped to one logical
-    /// partition, single-partition change feed, etc.).
The raw partition key - /// value is included in request headers and the request goes straight to - /// the gateway for the owning physical partition. No FeedRange / EPK - /// header is set. - PartitionKey(PartitionKey), - - /// Target a specific feed range. - /// - /// Used for feed operations that span one or more partitions. Uses the - /// `FeedRange` type, which represents a contiguous span of effective - /// partition key (EPK) space. See §3.2.1 below for the type's origin. - /// Use `OperationTarget::all_ranges()` for the whole container. - /// - /// The pipeline resolves the FeedRange to the owning PK range ID(s) via - /// the `PartitionKeyRangeCache` at execution time. - FeedRange(FeedRange), -} -``` - -#### 3.2.1 Migrating `FeedRange` from `azure_data_cosmos` - -The `FeedRange` type currently lives in `azure_data_cosmos::feed_range` (see -`sdk/cosmos/azure_data_cosmos/src/feed_range.rs`). It is the public, opaque, cross-SDK-compatible -representation of a contiguous EPK range, with stable wire formats (base64-encoded JSON via -`Display`/`FromStr`, and structured JSON via `Serialize`/`Deserialize`). - -This spec proposes **migrating `FeedRange` into the driver** (`azure_data_cosmos_driver`) so -that it can be used by `OperationTarget`, `ContinuationToken` resume state, and diagnostics -without crossing crate boundaries. The `azure_data_cosmos` crate then re-exports `FeedRange` -to preserve the existing public API. - -Rationale: -- The driver's `OperationTarget::FeedRange` variant must be public (`OperationTarget` is a - driver-public type), so it cannot use a `pub(crate)` driver-internal range type. -- `FeedRange` is already designed as a stable, cross-SDK-compatible type; promoting it to the - driver consolidates the canonical definition in one place. -- Other driver-internal range types (e.g., `EpkRange`) remain `pub(crate)` and continue to - serve their internal callers. - -Migration steps (out of scope for this spec, but for context): -1. Move `feed_range.rs` to `azure_data_cosmos_driver`. -2. Re-export `FeedRange` from `azure_data_cosmos` (e.g., `pub use azure_data_cosmos_driver::FeedRange;`). -3. Update internal driver code to consume `FeedRange` directly rather than its old location. - -```rust -impl OperationTarget { - /// The full key space: targets all partition key ranges. - pub fn all_ranges() -> Self { - Self::FeedRange(FeedRange::all_ranges()) - } -} -``` - -### 3.3 Factory Method Updates - -Existing factory methods are updated to use `OperationPayload` and `OperationTarget`: - -```rust -impl CosmosOperation { - /// Reads an item. - pub fn read_item(item: ItemReference) -> Self { - let partition_key = item.partition_key().clone(); - Self::new(OperationType::Read, item) - .with_target(OperationTarget::PartitionKey(partition_key)) - } - - /// Creates an item. Use `with_body()` to provide the document JSON. - pub fn create_item( - container: ContainerReference, - partition_key: PartitionKey, - ) -> Self { - let resource_ref = CosmosResourceReference::from(container) - .with_resource_type(ResourceType::Document) - .into_feed_reference(); - Self::new(OperationType::Create, resource_ref) - .with_target(OperationTarget::PartitionKey(partition_key)) - // Caller attaches body via .with_payload(OperationPayload::Body(...)) - } - - /// Runs a SQL query (`SELECT * [WHERE …]`, etc.). - /// - /// Without an explicit `with_target(...)`, the query targets the entire - /// container (`OperationTarget::all_ranges()`). 
To scope the query to a
-    /// single logical partition (which unlocks `ORDER BY` and other clauses
-    /// without a query plan — see §5.2), call
-    /// `.with_target(OperationTarget::PartitionKey(pk))`. To scope it to a
-    /// specific FeedRange, call `.with_target(OperationTarget::FeedRange(fr))`.
-    pub fn query(
-        container: ContainerReference,
-        query: impl Into<String>,
-        parameters: Vec<QueryParameter>,
-    ) -> Self {
-        let resource_ref = CosmosResourceReference::from(container)
-            .with_resource_type(ResourceType::Document)
-            .into_feed_reference();
-        Self::new(OperationType::Query, resource_ref)
-            .with_target(OperationTarget::all_ranges())
-            .with_payload(OperationPayload::Query {
-                query: query.into(),
-                parameters,
-            })
-    }
-}
-```
-
-### 3.4 Backward Compatibility
-
-The `body: Option<Vec<u8>>` field is removed and replaced with `payload: OperationPayload`.
-Factory methods that previously required `.with_body(...)` now accept the body in the factory
-method or via `.with_payload(...)`. A convenience method `with_body(Vec<u8>)` can be kept as
-sugar for `with_payload(OperationPayload::Body(...))`.
-
-The transport pipeline's request builder must be updated to extract body bytes from
-`OperationPayload` when constructing the Cosmos request. For `Body` variants, this is
-straightforward. For `None`, no body is sent. Future payload variants (Query, ReadMany)
-will be handled by the Planner before reaching the transport pipeline.
-
----
-
-## 4. Operation Plans
-
-### 4.1 Plan Model
-
-An `OperationPlan` describes the work needed to execute an operation. The Planner builds the
-plan as a set of nodes; each node represents one step in the pipeline.
-
-Rust's ownership model does not lend itself well to owning tree structures with parent-child
-references. Instead, the plan uses a **flat list of nodes** with index-based references:
-
-- **`NodeId`** is an offset into the plan's node list, used for parent-child relationships.
-- **`NodeRange`** is a `[start, end)` pair of `NodeId` values representing a contiguous slice
-  of children, avoiding a separate `Vec<NodeId>` heap allocation.
-- Nodes are stored **bottom-up**: child nodes always appear before their parents in the list.
-  This makes `NodeId` values stable and deterministic — the same plan input always produces
-  the same node ordering.
-
-```rust
-/// Index of a node within an `OperationPlan::Graph`'s node list.
-///
-/// NodeIds are stable within a plan: the same inputs produce the same
-/// node ordering. Children always have lower NodeIds than their parents
-/// (bottom-up invariant).
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub(crate) struct NodeId(u32);
-
-/// A contiguous range of node indices `[start, end)`.
-///
-/// Used to reference a slice of children without a separate heap allocation.
-/// Children in a NodeRange are always contiguous in the node list because
-/// they are built together by the planner.
-#[derive(Clone, Copy, Debug)]
-pub(crate) struct NodeRange {
-    pub start: NodeId,
-    pub end: NodeId,
-}
-
-impl NodeRange {
-    pub fn len(&self) -> usize {
-        (self.end.0 - self.start.0) as usize
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.start == self.end
-    }
-
-    pub fn iter(&self) -> impl Iterator<Item = NodeId> {
-        (self.start.0..self.end.0).map(NodeId)
-    }
-}
-```
-
-```rust
-/// A plan for executing an operation.
-///
-/// Plans range from trivial (single node for a point read) to multi-node
-/// (sequential drain across partition key ranges). The plan is created
-/// by the Planner and executed by the PlanExecutor.
-pub(crate) enum OperationPlan {
-    /// A single-node plan, stored inline. No heap allocation.
-    /// Used for point operations and single-partition feed operations.
-    SingleNode(PlanNode),
-
-    /// A multi-node plan stored as a flat list of nodes.
-    /// Nodes are stored bottom-up: children appear before parents.
-    /// Used for cross-partition feed operations (e.g., a cross-partition `SELECT *` query).
-    Graph {
-        /// The flat list of nodes. Children appear before parents.
-        nodes: Vec<PlanNode>,
-        /// The root node of the plan (always the last node in the list).
-        root: NodeId,
-    },
-}
-```
-
-```rust
-/// A node in an operation plan.
-///
-/// Nodes reference each other via `NodeId` and `NodeRange` within the
-/// flat node list. Composite nodes (SequentialDrain) reference child nodes;
-/// leaf nodes (Request) have no children.
-pub(crate) enum PlanNode {
-    /// Execute a single Cosmos request via the operation pipeline.
-    ///
-    /// Each Request node targets a specific **EPK range** (not a PK range ID).
-    /// At execution time, the node resolves its EPK range to the current PK
-    /// range ID(s) via the `PartitionKeyRangeCache`. The Request node handles
-    /// both **splits** (its EPK range maps to multiple child PK ranges) and
-    /// **merges** (its EPK range falls entirely within a larger merged PK
-    /// range) by issuing requests against the appropriate current PK ranges.
-    /// In the merge case, the Request must include EPK min/max headers so the
-    /// server only returns items inside the original range. The next time
-    /// the plan is generated, EPK ranges will reflect the new topology and
-    /// the plan resumes with the new ranges.
-    Request {
-        /// The operation to execute, targeted to a specific EPK range.
-        /// Wrapped in `Arc` so that sibling Request nodes can share the base
-        /// operation without cloning the full payload (headers, resource
-        /// reference, etc.).
-        operation: Arc<CosmosOperation>,
-        /// Options for this fetch.
-        options: OperationOptions,
-        /// The EPK range this fetch targets.
-        feed_range: FeedRange,
-        /// Server-provided continuation token for this range, if resuming.
-        continuation: Option<String>,
-    },
-
-    /// Sequential cross-partition drain.
-    ///
-    /// Enumerates child Request nodes in EPK order, draining each partition
-    /// completely before moving to the next. Each page comes from exactly
-    /// one partition — pages do not span partition boundaries.
-    ///
-    /// Within each partition, items are returned in ascending RID order
-    /// (the natural server sort order).
-    SequentialDrain {
-        /// Child Request nodes, ordered by EPK range.
-        /// References a contiguous range in the plan's node list.
-        children: NodeRange,
-    },
-
-    // Future variants:
-    // UnorderedMerge { children: NodeRange },
-    // OrderedMerge { children: NodeRange, order_by: ... },
-    // Aggregate { children: NodeRange, aggregation: ... },
-}
-```
-
-### 4.2 Bottom-Up Invariant
-
-The flat node list is always built **bottom-up**: leaf nodes (Request) are pushed first,
-then their parent (SequentialDrain) is pushed after them. This produces a deterministic layout where
-`NodeId` values are stable for a given set of inputs.
-
-For a cross-partition `SELECT *` plan over 3 partitions, the node list looks like:
-
-```text
-Index  Node
-─────  ──────────────────────────────────────────
-  0    Request { feed_range: ["","55"), ... }
-  1    Request { feed_range: ["55","AA"), ... }
-  2    Request { feed_range: ["AA","FF"), ... }
-  3    SequentialDrain { children: NodeRange(0..3) }
-
-root = NodeId(3)
-```
-
-The `NodeRange(0..3)` for the SequentialDrain's children is a zero-cost reference to the contiguous
-slice of Request nodes. No `Vec` allocation is needed.
-
-### 4.3 Plan Examples
-
-#### Point Operation (ReadItem)
-
-```text
-SingleNode(Request { operation: read_item, feed_range: pk_epk, continuation: None })
-```
-
-A `SingleNode` plan with one `Request` node. The executor runs it directly, gets a
-`CosmosResponse`, and is done. No heap allocation.
-
-#### Cross-Partition `SELECT *` Query
-
-```text
-Graph {
-    nodes: [
-        0: Request { feed_range: ["","55"), continuation: None },
-        1: Request { feed_range: ["55","AA"), continuation: None },
-        2: Request { feed_range: ["AA","FF"), continuation: None },
-        3: SequentialDrain { children: NodeRange(0..3) },
-    ],
-    root: NodeId(3),
-}
-```
-
-The executor processes partitions sequentially:
-1. Request all pages from EPK range `["","55")` until that partition is drained.
-2. Move to EPK range `["55","AA")`, fetch all pages.
-3. Move to EPK range `["AA","FF")`, fetch all pages.
-
-Each `execute_operation` call produces exactly **one page** from the currently-active
-partition. When a partition is fully drained (server returns no continuation), the next
-call starts the next partition. A continuation token is returned after each page until
-all partitions are exhausted.
-
-#### Cross-Partition `SELECT *` — Resumed from Continuation
-
-When resuming from a continuation token that says "active range is `["55","AA")` with
-server token `xyz`", the Planner skips already-drained ranges and rebuilds the plan
-starting from the active range:
-
-```text
-Graph {
-    nodes: [
-        0: Request { feed_range: ["55","AA"), continuation: Some("xyz") },
-        1: Request { feed_range: ["AA","FF"), continuation: None },
-        2: SequentialDrain { children: NodeRange(0..2) },
-    ],
-    root: NodeId(2),
-}
-```
-
-Only the remaining partitions are in the plan. The first Request carries the server
-continuation from the token.
-
-### 4.4 SingleNode Optimization
-
-For point operations, the plan model MUST be zero or near-zero overhead compared to the current
-direct `execute_single_operation` call. The `OperationPlan::SingleNode` variant ensures this:
-
-- **No heap allocation**: The single `PlanNode` is stored inline in the enum, not in a `Vec`.
-- **No graph traversal**: The executor matches on `SingleNode` and directly calls
-  `execute_single_operation`.
-
----
-
-## 5. Planner
-
-### 5.1 Responsibilities
-
-The Planner transforms a `CosmosOperation` into an `OperationPlan`. For a cross-partition
-`SELECT *` query, this is straightforward: resolve partition key ranges and build a
-`SequentialDrain` node over `Request` children.
-
-```rust
-pub(crate) struct Planner<'a> {
-    /// Access to the PK range cache for partition resolution.
-    pk_range_cache: &'a PartitionKeyRangeCache,
-}
-
-impl<'a> Planner<'a> {
-    /// Creates an operation plan from a CosmosOperation.
-    ///
-    /// For point operations, this is synchronous and trivial.
-    /// For cross-partition `SELECT *`, this resolves PK ranges and builds a SequentialDrain plan.
-    pub async fn plan(
-        &self,
-        operation: &CosmosOperation,
-        options: &OperationOptions,
-        continuation: Option<&ContinuationToken>,
-        // Callback for fetching PK ranges (keeps Planner transport-decoupled).
-        fetch_pk_ranges: impl Fn(...) -> ...,
-    ) -> azure_core::Result<OperationPlan> {
-        // ...
- } -} -``` - -### 5.2 Planning Logic by Operation Type - -| Operation | Targeting | Plan Strategy | -|-----------|-----------|---------------| -| ReadItem, DeleteItem, etc. | `PartitionKey` | Single `Request` node. SingleNode. | -| CreateDatabase, ReadContainer, etc. | `None` | Single `Request` node. SingleNode. | -| `SELECT * [WHERE …]`, single partition | `PartitionKey` | Single `Request` node. Paginated. **Fast-path: no query plan fetch.** See [§5.2.1](#521-single-partition-query-fast-path). | -| Single-partition query with `ORDER BY` / `GROUP BY` / aggregates / `OFFSET LIMIT` / etc. | `PartitionKey` | Same as above — pass through verbatim. | -| `SELECT * [WHERE …]`, cross-partition | `FeedRange` (`all_ranges()` or a caller-supplied range) | Resolve PK ranges → `SequentialDrain` over N `Request` nodes. Sequential. No query plan needed for `SELECT * [WHERE …]`. | -| Cross-partition query with `ORDER BY` / aggregates / vector / hybrid | `FeedRange` | **Out of scope for this spec.** Requires backend query plan retrieval — see [§12.2](#122-cross-partition-queries). | - -#### 5.2.1 Single-Partition Query Fast-Path - -When an operation has `OperationTarget::PartitionKey(pk)` *and* an `OperationPayload::Query`, -the Planner produces a trivial `SingleNode` plan and the executor sends the request directly -to the gateway against the owning physical partition. **No query plan is fetched** and **no -client-side rewriting** is performed: - -- The query body is forwarded as-is in the `application/query+json` envelope. -- Arbitrary single-partition SQL is supported, including `ORDER BY`, `GROUP BY`, `DISTINCT`, - aggregates (`COUNT`, `SUM`, …), `OFFSET / LIMIT`, and `TOP` — the gateway evaluates them - inside the single physical partition and the result page is correct as returned. -- Vector search (`VectorDistance`) and hybrid search clauses are *also* accepted on this - path today because they collapse to a single-partition execution. They produce correct - results when the entire vector / hybrid evaluation fits in one partition, but see the - caveat below. -- The continuation token, if any, is the server's opaque continuation for that one partition - (a `ResumeState::Request`). - -**Why this is safe.** A query whose data set is bounded to a single logical partition is -already evaluated in a single backend execution context. Aggregates and ordering operators -are correct without a client-side merge step, so the driver does not need a query plan to -drive correctness. - -**Future change — query plan fetched even for single-partition queries.** Vector and hybrid -queries can become incorrect on the single-partition fast-path in edge cases (e.g., the -backend returning per-partition truncated candidate lists where the global merge requires -the query plan's score-rewriting hints). To keep the fast-path correct as new query -features ship, the driver will eventually start **fetching a query plan** for -single-partition queries too. The plan will be cached, and for queries the plan classifies -as "passthrough" the execution path is unchanged. This is a future change and is not -required to ship the in-scope `SELECT * [WHERE …]` work. 
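-To make the fast-path concrete, here is a caller-side sketch using the factory API from
-§3.3. It is illustrative only: `PartitionKey::from` and the exact builder names are
-assumptions for the example, not committed API.
-
-```rust
-// Sketch — hitting the single-partition fast-path from the SDK layer.
-// `container_ref` is an existing ContainerReference.
-let op = CosmosOperation::query(
-    container_ref,
-    // ORDER BY is allowed here: the single partition evaluates it server-side.
-    "SELECT * FROM c WHERE c.state = 'WA' ORDER BY c.name",
-    vec![],
-)
-// Replaces the default all_ranges() target. With a PartitionKey target plus a
-// Query payload, the Planner emits a trivial SingleNode plan and fetches no
-// backend query plan (§5.2.1).
-.with_target(OperationTarget::PartitionKey(PartitionKey::from("customer-42")));
-```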
- -### 5.3 Pseudo-Code: Building a Trivial Plan - -The following pseudo-code illustrates how the Planner constructs a plan for a point -operation or single-partition feed: - -```rust -// PSEUDO-CODE — illustrative, not compilable -fn plan_trivial(operation: CosmosOperation, options: OperationOptions) -> OperationPlan { - OperationPlan::SingleNode(PlanNode::Request { - feed_range: operation.target().as_epk_range(), - operation: Arc::new(operation), - options, - continuation: None, - }) -} -``` - -No PK range resolution is needed. The operation is wrapped in a single `Request` node. - -### 5.4 Pseudo-Code: Building a Cross-Partition `SELECT *` Plan - -The following pseudo-code illustrates how the Planner constructs a cross-partition -`SELECT *` plan, including resume from a continuation token: - -```rust -// PSEUDO-CODE — illustrative, not compilable -fn plan_read_feed( - operation: &CosmosOperation, - pk_ranges: &[PartitionKeyRange], - continuation: Option<&ContinuationToken>, -) -> OperationPlan { - // Determine where to start: either from a continuation token or the beginning. - let (start_epk, server_token) = match continuation { - Some(token) => { - let state = token.resume_state(); - (state.epk_min(), state.server_token().cloned()) - } - None => (EffectivePartitionKey::MIN, None), - }; - - // Build Request nodes bottom-up, one per PK range that hasn't been drained. - let shared_op = Arc::new(create_fetch_from(operation)); - let mut nodes = Vec::new(); - - let remaining_ranges = pk_ranges - .iter() - .filter(|r| r.max_epk() > start_epk); - - let mut is_first_remaining = true; - for range in remaining_ranges { - let continuation = if is_first_remaining { - is_first_remaining = false; - server_token.clone() - } else { - None - }; - - nodes.push(PlanNode::Request { - operation: Arc::clone(&shared_op), - options: derive_request_options(range), - feed_range: range.feed_range(), - continuation, - }); - } - - // Push the SequentialDrain node after all its children (bottom-up invariant). - let children = NodeRange { - start: NodeId(0), - end: NodeId(nodes.len() as u32), - }; - nodes.push(PlanNode::SequentialDrain { children }); - - let root = NodeId(nodes.len() as u32 - 1); - OperationPlan::Graph { nodes, root } -} -``` - -Key points: -- Request nodes are pushed first (children), then the SequentialDrain (parent) — maintaining the - bottom-up invariant. -- On resume, ranges left of the continuation's EPK min are skipped entirely. The first - remaining Request carries the server token from the continuation. -- All Request nodes share the base operation via `Arc`, avoiding clones of headers and - resource references. - -### 5.5 Resuming from a Continuation Token - -When a `ContinuationToken` is provided, the Planner validates it (version, container RID, -operation kind), resolves the current partition key ranges, and uses the token's resume -state to reconstruct the plan at the correct position. - -The resume algorithm for `SequentialDrain` is described in [§7.3 Resume Strategy](#73-resume-strategy). - -### 5.6 Future Extensions - -The Planner architecture supports future operations without redesign: - -- **ReadMany**: Group items by PK range, create concurrent `Request` nodes with an - `UnorderedMerge` parent. Requires adding concurrency support to the PlanExecutor. -- **Cross-partition query**: Request a backend query plan, create `Request` nodes per - partition, optionally with `OrderedMerge` for ORDER BY queries. 
- **Change feed**: Create `Request` nodes scoped to feed ranges with change-feed-specific
-  continuation state. Add a parent merge node based on change-feed merge semantics.
-- **Concurrency management**: All plan nodes receive a **concurrency permit** (semaphore
-  token) during execution. For a cross-partition `SELECT *`, the executor holds a single
-  permit — sequential by design. Future operations (ReadMany, cross-partition queries) will acquire multiple
-  permits from a shared semaphore, allowing the PlanExecutor to control the degree of
-  parallelism across nodes without changing the plan model.
-
----
-
-## 6. Plan Executor
-
-### 6.1 Core Execution Loop
-
-The Plan Executor runs an `OperationPlan` and produces one page of results per call.
-
-```rust
-pub(crate) struct PlanExecutor;
-
-impl PlanExecutor {
-    /// Executes one page of the plan, producing a `CosmosResponse`.
-    ///
-    /// The response includes a continuation token if more pages are available.
-    /// Each call executes exactly one Cosmos request to one partition.
-    pub async fn execute(
-        plan: &OperationPlan,
-        driver_context: &DriverContext,
-        diagnostics: &mut DiagnosticsContextBuilder,
-    ) -> azure_core::Result<CosmosResponse> {
-        // ...
-    }
-}
-```
-
-The following pseudo-code illustrates the core execution loop for a `SequentialDrain` plan.
-Function names are descriptive; their implementations are not shown.
-
-```rust
-// PSEUDO-CODE — illustrative, not compilable
-async fn execute_plan(
-    plan: &OperationPlan,
-    driver_context: &DriverContext,
-    diagnostics: &mut DiagnosticsContextBuilder,
-) -> Result<CosmosResponse> {
-    match plan {
-        OperationPlan::SingleNode(request) => {
-            // Point ops and single-partition feeds: execute directly.
-            execute_request_node(request, driver_context, diagnostics).await
-        }
-        OperationPlan::Graph { nodes, root } => {
-            let root_node = &nodes[root.0 as usize];
-            execute_node(root_node, nodes, driver_context, diagnostics).await
-        }
-    }
-}
-
-async fn execute_node(
-    node: &PlanNode,
-    all_nodes: &[PlanNode],
-    driver_context: &DriverContext,
-    diagnostics: &mut DiagnosticsContextBuilder,
-) -> Result<CosmosResponse> {
-    match node {
-        PlanNode::Request { .. } => {
-            execute_request_node(node, driver_context, diagnostics).await
-        }
-        PlanNode::SequentialDrain { children } => {
-            // Find the active child: the first Request that hasn't been drained.
-            // On a fresh plan, this is children.start. On resume, the Planner
-            // has already pruned drained partitions, so children.start is the
-            // active one.
-            let active_id = children.start;
-            let active_request = &all_nodes[active_id.0 as usize];
-
-            // Acquire a concurrency permit (sequential: only one permit).
-            let _permit = acquire_concurrency_permit(driver_context).await;
-
-            // Execute one page from the active partition.
-            let response = execute_request_node(
-                active_request, driver_context, diagnostics
-            ).await?;
-
-            // Build the continuation token based on what happened.
-            let continuation = build_drain_continuation(
-                &response, active_request, active_id, children, all_nodes
-            );
-
-            Ok(response.with_continuation(continuation))
-        }
-    }
-}
-```
-
-### 6.2 Backpressure & Cancellation
-
-- **Caller drops the future**: The in-flight `execute_single_operation` future is
-  cancelled via standard Rust drop semantics.
-- **Memory bounds**: Each call buffers at most one page of results.
-- **Cancellation mid-page**: If the caller cancels during a page fetch, the continuation
-  token from the *previous* completed call remains valid for resumption (see the sketch
-  below).
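-The one-page-per-call contract implies a simple loop at the SDK layer. A sketch, with
-assumed accessor names (`continuation()`, `with_continuation_token`) — the driver only
-guarantees the page/continuation semantics described above:
-
-```rust
-// PSEUDO-CODE — SDK-side pagination loop over execute_operation.
-// The driver returns one page per call; the SDK owns iteration.
-let mut continuation: Option<ContinuationToken> = None;
-loop {
-    let mut opts = base_options.clone();
-    if let Some(token) = continuation.take() {
-        // Resuming: the Planner rebuilds the plan at the token's position.
-        opts = opts.with_continuation_token(token);
-    }
-    let response = driver.execute_operation(&operation, &opts).await?;
-    consume_page(&response); // hand raw item bytes to the deserialization layer
-    match response.continuation() {
-        Some(token) => continuation = Some(token.clone()), // more pages remain
-        None => break, // feed fully drained
-    }
-}
-```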
-
-### 6.3 OperationOverrides
-
-A `CosmosOperation` represents the **stable, fetch-independent** part of a Cosmos request:
-operation type, resource reference, partition targeting, payload, and any caller-supplied
-headers. For a feed operation, the same `CosmosOperation` is reused across every page and
-every EPK range — only a small set of headers and parameters differ from one fetch to the
-next.
-
-To avoid cloning the full `CosmosOperation` per fetch (and to avoid holding
-`PlanNode::Request` open after a single fetch is done), the executor passes an
-`OperationOverrides` struct to `execute_single_operation` alongside an
-`&CosmosOperation` reference. Each invocation produces a fresh request by composing the
-shared base operation with the per-fetch overrides.
-
-```rust
-/// Per-fetch overrides applied on top of a shared `CosmosOperation`.
-///
-/// Strictly limited to the headers / parameters that legitimately differ
-/// between successive fetches against the same logical operation. Anything
-/// not on this list belongs on the `CosmosOperation` itself.
-#[derive(Clone, Debug, Default)]
-pub struct OperationOverrides {
-    /// EPK range the request is scoped to. When set, the transport layer
-    /// emits `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max` headers
-    /// and routes to the PK range(s) currently owning that EPK slice.
-    ///
-    /// Only valid when the base operation's target is `OperationTarget::FeedRange`
-    /// or `OperationTarget::all_ranges()`. Ignored for `OperationTarget::PartitionKey`
-    /// (a logical PK already pins the request to one physical partition).
-    pub feed_range: Option<FeedRange>,
-
-    /// Server-provided continuation token from the previous page of the
-    /// same fetch loop. Emitted as `x-ms-continuation`. `None` for the
-    /// first page.
-    pub continuation: Option<String>,
-
-    /// Per-fetch override for the maximum item count hint
-    /// (`x-ms-max-item-count`). Falls back to the value carried by
-    /// `OperationOptions::max_item_count` when unset.
-    pub max_item_count: Option<u32>,
-}
-```
-
-The `execute_single_operation` entry point therefore becomes:
-
-```rust
-pub(crate) async fn execute_single_operation(
-    &self,
-    operation: &CosmosOperation,
-    options: &OperationOptions,
-    overrides: &OperationOverrides,
-    diagnostics: &mut DiagnosticsContextBuilder,
-) -> azure_core::Result<CosmosResponse> {
-    // Apply overrides to the request being built from `operation`.
-    // Run the existing pipeline (region failover, session, retry, transport).
-}
-```
-
-#### What overrides MAY carry
-
-The set is deliberately small and frozen by this spec:
-
-| Field | Purpose | Header / wire effect |
-|-------|---------|----------------------|
-| `feed_range` | Per-fetch EPK targeting (split / merge handling, drain progression) | `x-ms-documentdb-epk-min`, `x-ms-documentdb-epk-max`, PK-range routing |
-| `continuation` | Resume the same partition mid-stream | `x-ms-continuation` |
-| `max_item_count` | Per-fetch page-size hint | `x-ms-max-item-count` |
-
-#### What overrides MUST NOT carry
-
-To keep `OperationOverrides` predictable and cheap to validate, it explicitly does NOT
-carry anything that changes operation identity, semantics, or auth. The following stay on
-the base `CosmosOperation` (or on `OperationOptions`) and are an error to put on overrides:
-
-- Operation type, resource type, resource reference.
-- Partition key value (the logical PK is part of the operation's target).
-- Request body / payload (`OperationPayload`).
-- Consistency level, session token, throughput control group, retry policy.
-- Authentication or any other header that affects request signing.
-
-#### Plan-node integration
-
-`PlanNode::Request` stores the per-fetch *intent* (the EPK range that this leaf is
-responsible for, plus any server continuation it was resumed with). At execution time, the
-executor materializes that into an `OperationOverrides` and runs:
-
-```rust
-// PSEUDO-CODE
-async fn execute_request_node(
-    node: &PlanNode, // PlanNode::Request
-    driver_context: &DriverContext,
-    diagnostics: &mut DiagnosticsContextBuilder,
-) -> Result<CosmosResponse> {
-    let PlanNode::Request { operation, options, feed_range, continuation } = node else {
-        unreachable!()
-    };
-    let overrides = OperationOverrides {
-        feed_range: Some(feed_range.clone()),
-        continuation: continuation.clone(),
-        max_item_count: options.max_item_count,
-    };
-    driver_context
-        .execute_single_operation(operation.as_ref(), options, &overrides, diagnostics)
-        .await
-}
-```
-
-Because `operation` is an `Arc<CosmosOperation>` shared across every Request leaf in the
-plan (see §5.4), and because the executor only synthesizes a tiny `OperationOverrides`
-per fetch, the same `CosmosOperation` can drive an arbitrary number of EPK-range fetches
-without being cloned, mutated, or re-built. The base operation outlives the entire feed
-operation; overrides are scratch state owned by a single fetch and thrown away after the
-response is returned.
-
-This is also what makes splits / merges cheap: when the Request leaf re-resolves its EPK
-range to new PK range IDs (see [§9.1](#91-partition-split-during-execution)), it issues
-follow-up calls to `execute_single_operation` against the **same** `CosmosOperation`,
-varying only the `feed_range` (and where applicable the `continuation`) inside the
-`OperationOverrides`.
-
----
-
-## 7. Continuation Tokens
-
-### 7.1 Design Principles
-
-Continuation tokens must be:
-
-1. **Durable across SDK versions** — A token produced by SDK version N must be usable by
-   SDK version N+k. Tokens may be stored durably (e.g., in a database) or transiently
-   (e.g., in a URL parameter) and must survive SDK upgrades. Newer SDKs MUST support reading
-   tokens from older SDKs. Changing the token format dramatically increases complexity because
-   SDKs must support versions `current - x`.
-
-2. **Versioned** — Tokens carry a version field. Revving the version is the option of last
-   resort. New `ResumeState` variants can be added without changing the version, because
-   `serde`'s tagged enum deserialization handles unknown variants gracefully (they fail to
-   parse, which is the correct behavior when an older SDK encounters a token from a newer one).
-
-   **Version preservation across resume:** When resuming from an input continuation token,
-   the SDK MUST emit any output continuation token using the **same version** as the input
-   token. This guarantees that a caller persisting the token across pages does not observe
-   a version "drift" mid-operation: a token started at version N continues to round-trip as
-   version N until the operation completes, even if the SDK has since added support for a
-   higher version. The SDK only emits the latest version when no input token is provided.
-
-3. **Aim for O(1) size** — Token size should ideally be constant regardless of partition
-   count. For cross-partition `SELECT *`, only the state of the currently-active partition is
-   stored; other partitions' positions are reconstructed from EPK bounds on resume.
-   However, per-partition state MAY become necessary for certain node types (e.g., change
-   feed requires per-range tokens). It is up to each node type to define its own resume
-   state and thus determine the size of that state.
-
-4. **Composable** — Each node type defines its own `ResumeState` variant. New node types
-   add new variants without breaking the token structure for existing node types. The resume
-   state is extensible via serde's tagged enum — unknown variants from newer SDKs correctly
-   fail to deserialize in older SDKs.
-
-5. **Operation-bound** — Tokens include an operation kind to prevent replaying a token from
-   one operation type against a different operation on the same container.
-
-### 7.2 Token Structure
-
-```rust
-/// A typed continuation token for resuming a feed operation.
-///
-/// Opaque to callers. Serializes to a string via `Display` and
-/// deserializes via `FromStr`. The internal representation is
-/// versioned and validated on deserialization.
-#[derive(Clone, Debug)]
-pub struct ContinuationToken {
-    inner: ContinuationTokenInner,
-}
-
-/// Internal token representation (not public).
-#[derive(Clone, Debug, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct ContinuationTokenInner {
-    /// Token format version for forward/backward compatibility.
-    version: u32,
-
-    /// Container identity (RID, not name) to detect container recreation.
-    container_rid: String,
-
-    /// The operation kind this token was produced for.
-    /// Prevents replaying tokens across incompatible operations.
-    operation_kind: String,
-
-    /// The resume state, defined by the node type that produced it.
-    resume: ResumeState,
-}
-```
-
-```rust
-/// Resume state for a plan node.
-///
-/// Each variant captures the state for one node type. New variants
-/// can be added as new node types are introduced, without changing
-/// the token version.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-#[serde(tag = "type")]
-enum ResumeState {
-    /// Sequential cross-partition drain.
-    ///
-    /// Tracks the current feed range position: `epk_min` and `epk_max`
-    /// identify the active range, and `server_token` holds the server
-    /// continuation for that range (if mid-partition).
-    ///
-    /// On resume, ranges with max ≤ `epk_min` are skipped (already drained).
-    /// The range matching `[epk_min, epk_max)` resumes from `server_token`.
-    /// Ranges after `epk_max` start fresh.
-    #[serde(rename = "sequentialDrain")]
-    SequentialDrain(SequentialDrainState),
-
-    /// A single partition request, mid-stream or just completed.
-    /// Used as the root resume state for single-partition feed operations.
-    #[serde(rename = "request")]
-    Request(RequestState),
-
-    // Future variants (added without changing token version):
-    //
-    // /// Change feed — per-range continuation tokens.
-    // #[serde(rename = "changeFeed")]
-    // ChangeFeed(ChangeFeedState),
-    //
-    // /// Ordered merge for ORDER BY queries.
-    // #[serde(rename = "orderedMerge")]
-    // OrderedMerge(OrderedMergeState),
-}
-
-/// Resume state for a SequentialDrain node.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct SequentialDrainState {
-    /// EPK minimum of the current active feed range.
-    /// All ranges with max ≤ this value have been fully drained.
-    epk_min: String,
-
-    /// EPK maximum of the current active feed range.
-    epk_max: String,
-
-    /// Server-provided continuation token for this range.
-    /// `None` when this range was just completed and the cursor
-    /// is at the boundary to the next range.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    server_token: Option<String>,
-}
-
-/// Resume state for a single-partition Request node.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct RequestState {
-    /// EPK min inclusive of the target range.
-    epk_min: String,
-
-    /// EPK max exclusive of the target range.
-    epk_max: String,
-
-    /// Server-provided continuation token for this range.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    server_token: Option<String>,
-}
-```
-
-#### Wire-format field reference
-
-| Rust type | Field | Wire key | Content |
-|-----------|-------|----------|---------|
-| `ContinuationTokenInner` | `version` | `version` | Format version (integer) |
-| | `container_rid` | `containerRid` | Container RID (string) |
-| | `operation_kind` | `operationKind` | Operation kind (e.g., `"query"`) |
-| | `resume` | `resume` | `ResumeState` (tagged union) |
-| `SequentialDrainState` | *(tag)* | `type` | `"sequentialDrain"` |
-| | `epk_min` | `epkMin` | EPK min inclusive (hex string) |
-| | `epk_max` | `epkMax` | EPK max exclusive (hex string) |
-| | `server_token` | `serverToken` | Server continuation (omitted if null) |
-| `RequestState` | *(tag)* | `type` | `"request"` |
-| | `epk_min` | `epkMin` | EPK min inclusive (hex string) |
-| | `epk_max` | `epkMax` | EPK max exclusive (hex string) |
-| | `server_token` | `serverToken` | Server continuation (omitted if null) |
-
-### 7.3 Resume Strategy
-
-On resume, the Planner validates the token and uses the resume state to reconstruct the
-plan at the correct position.
-
-#### `SequentialDrain` (sequential cross-partition)
-
-The `SequentialDrainState` tracks the cursor position via EPK bounds. On resume:
-
-| Partition position | Action |
-|--------------------|--------|
-| **Left of active** (range max ≤ `epk_min`) | Skip — already drained. |
-| **Active range** (matches `[epk_min, epk_max)`) | Resume using `server_token`. If `server_token` is `None`, the range is complete — skip it and start the next range fresh. |
-| **Right of active** (range min ≥ `epk_max`) | Start fresh (not yet visited). |
-
-If the active range has split since the token was created, the Planner uses the EPK bounds
-to assign the server continuation to the appropriate child range. The `server_token` applies
-to the first sub-range that overlaps the original EPK bounds; subsequent sub-ranges start
-fresh.
-
-#### `Request` (leaf — single partition)
-
-A bare `RequestState` at the root (no wrapping `SequentialDrain`) represents a single-partition operation.
-Resume uses `server_token` directly.
-
-### 7.4 Serialization
-
-`ContinuationToken` implements `Display` and `FromStr`. The wire format is base64url-encoded
-JSON (using the URL-safe alphabet with no padding):
-
-```rust
-use base64::Engine as _; // brings `encode` / `decode` into scope
-
-impl Display for ContinuationToken {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let json = serde_json::to_vec(&self.inner).map_err(|_| fmt::Error)?;
-        let encoded = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(&json);
-        f.write_str(&encoded)
-    }
-}
-
-impl FromStr for ContinuationToken {
-    type Err = azure_core::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let decoded = base64::engine::general_purpose::URL_SAFE_NO_PAD
-            .decode(s)
-            .map_err(|e| azure_core::Error::new(ErrorKind::DataConversion, e))?;
-        let inner: ContinuationTokenInner = serde_json::from_slice(&decoded)
-            .map_err(|e| azure_core::Error::new(ErrorKind::DataConversion, e))?;
-        // Version check
-        if inner.version > CURRENT_TOKEN_VERSION {
-            return Err(azure_core::Error::with_message(
-                ErrorKind::DataConversion,
-                "continuation token version is newer than this SDK supports",
-            ));
-        }
-        Ok(Self { inner })
-    }
-}
-```
-
-#### Sample Tokens
-
-**Cross-partition `SELECT *`, mid-stream on partition ["55","AA")**
-
-JSON (before base64 encoding):
-```json
-{
-  "version": 1,
-  "containerRid": "dbs/abc/colls/def",
-  "operationKind": "query",
-  "resume": {
-    "type": "sequentialDrain",
-    "epkMin": "55",
-    "epkMax": "AA",
-    "serverToken": "+RID:~abc123#RT:1#TRC:10#ISV:2#IEO:65551"
-  }
-}
-```
-
-On resume, the Planner sees the drain cursor at `["55","AA")`. Ranges with max ≤ `"55"` are
-skipped. The range `["55","AA")` resumes from `serverToken`. Ranges after `"AA"` start fresh.
-
-**Cross-partition `SELECT *`, target partition just completed (cursor at boundary)**
-
-```json
-{
-  "version": 1,
-  "containerRid": "dbs/abc/colls/def",
-  "operationKind": "query",
-  "resume": {
-    "type": "sequentialDrain",
-    "epkMin": "55",
-    "epkMax": "AA"
-  }
-}
-```
-
-`serverToken` is absent, meaning partition `["55","AA")` is fully drained. The Planner
-skips everything up to and including this range, and starts the next partition fresh.
-
-**Single-partition feed, mid-stream**
-
-A bare `RequestState` at the root (no wrapping layer):
-
-```json
-{
-  "version": 1,
-  "containerRid": "dbs/abc/colls/def",
-  "operationKind": "query",
-  "resume": {
-    "type": "request",
-    "epkMin": "55",
-    "epkMax": "AA",
-    "serverToken": "-RID:QmFzZTY0#RT:3#TRC:50"
-  }
-}
-```
-
-### 7.5 Compatibility Contract
-
-A continuation token is **invalidated** by:
-
-1. **Container recreation** — The token's `containerRid` won't match the new container's RID.
-2. **Token version mismatch** — A token produced by a newer SDK version may not be readable
-   by an older version. Newer SDKs MUST support tokens from older versions (backward compat).
-3. **Operation kind mismatch** — The token's `operationKind` must match the operation being
-   resumed. A `query` token cannot be used with a non-query operation.
-4. **Structure mismatch** — If the re-created plan produces a different node type than the
-   token's `ResumeState` variant (e.g., a `sequentialDrain` token for a single-partition
-   operation), the token is rejected.
-
-A continuation token **survives**:
-
-1. **Partition splits and merges** — The token stores EPK bounds, not PK range IDs. On resume,
-   the Planner re-resolves EPK bounds to current PK range IDs. After a split, an original
-   range maps to multiple child ranges; after a merge, multiple original ranges map to a
-   single combined range.
Either way, the EPK bounds in the token still identify the exact - slice of the EPK space that has (or hasn't) been drained. -2. **SDK version upgrades** — The token is versioned. Older token versions are supported by - newer SDKs (backward compatible deserialization). -3. **Process boundaries** — The token is a self-contained string, safe to send to a browser - and back. -4. **Durable storage** — Tokens can be stored in databases and used across process restarts, - machine migrations, and SDK upgrades. - -### 7.6 What the Token Does NOT Encode - -- **Per-range state for all partitions (for SequentialDrain)** — Only the active range's state is - stored. Other partitions' positions are reconstructed from the EPK bounds on resume. Other - node types may store per-range state if needed (see §12.3 Change Feed). -- **Query text or parameters** — The caller must provide an equivalent `CosmosOperation`. -- **Session tokens** — Session consistency is not preserved across process boundaries via - the continuation token. -- **Container name or database name** — Only the RID is stored. -- **PK range IDs** — Only EPK bounds are stored, which are stable across partition splits. - PK range IDs are resolved dynamically from the `PartitionKeyRangeCache` on resume. - ---- - -## 8. Diagnostics Structure - -### 8.1 Design Principle - -The driver does **not** create OpenTelemetry spans or any other telemetry artifacts. Instead, -each call to `execute_operation` returns a `DiagnosticsContext` on the `CosmosResponse` -containing a structured hierarchy of timing and request data. The higher-level SDK crate uses -this data to create OTEL spans, log entries, or any other telemetry it chooses. - -This separation ensures the driver remains transport- and telemetry-agnostic while providing -enough detail for the SDK to reconstruct the full execution timeline. - -### 8.2 Hierarchy: Plan → Node → Request - -Each `execute_operation` call produces a `DiagnosticsContext` with a hierarchical view of the -operation plan's execution. The hierarchy mirrors the plan graph: composite nodes (SequentialDrain) -contain child node diagnostics, and leaf nodes (Request) contain Cosmos request diagnostics. - -```text -DiagnosticsContext - ├── activityId, totalDurationMs, totalRequestCharge - │ - └── operationPlan (NodeDiagnostics) - ├── nodeType: "sequentialDrain" - ├── startedAt, completedAt, durationMs - │ - └── children[] - └── [0] NodeDiagnostics - ├── nodeType: "request" - ├── epkRange: { min, max } - ├── startedAt, completedAt, durationMs - ├── requestCharge - ├── outcome: "success" | "failed" - │ - ├── requests[] - │ ├── [0] RequestDiagnostics (initial attempt) - │ └── [1] RequestDiagnostics (retry, if any) - │ - └── children[] (empty for Request) -``` - -Every node holds a list of diagnostics from the child nodes it triggered (`children`), -as well as its own Cosmos requests. This makes the diagnostics structure recursive and -directly mirrors the plan graph. - -For point operations (SingleNode plan), the hierarchy collapses: the `operationPlan` -is a single Request node with its requests and no children. The existing flat `requests()` -accessor is preserved for backward compatibility by flattening the tree. - -### 8.3 Hierarchical Diagnostics Types - -```rust -/// Diagnostics for a single plan node's execution. -/// -/// This type is recursive: composite nodes (SequentialDrain) contain child -/// `NodeDiagnostics` entries, mirroring the plan graph structure. -pub struct NodeDiagnostics { - /// What kind of node this was. 
## 8. Diagnostics Structure

### 8.1 Design Principle

The driver does **not** create OpenTelemetry spans or any other telemetry artifacts. Instead,
each call to `execute_operation` returns a `DiagnosticsContext` on the `CosmosResponse`
containing a structured hierarchy of timing and request data. The higher-level SDK crate uses
this data to create OTEL spans, log entries, or any other telemetry it chooses.

This separation ensures the driver remains transport- and telemetry-agnostic while providing
enough detail for the SDK to reconstruct the full execution timeline.

### 8.2 Hierarchy: Plan → Node → Request

Each `execute_operation` call produces a `DiagnosticsContext` with a hierarchical view of the
operation plan's execution. The hierarchy mirrors the plan graph: composite nodes (SequentialDrain)
contain child node diagnostics, and leaf nodes (Request) contain Cosmos request diagnostics.

```text
DiagnosticsContext
 ├── activityId, totalDurationMs, totalRequestCharge
 │
 └── operationPlan (NodeDiagnostics)
      ├── nodeType: "sequentialDrain"
      ├── startedAt, completedAt, durationMs
      │
      └── children[]
           └── [0] NodeDiagnostics
                ├── nodeType: "request"
                ├── epkRange: { min, max }
                ├── startedAt, completedAt, durationMs
                ├── requestCharge
                ├── outcome: "success" | "failed"
                │
                ├── requests[]
                │    ├── [0] RequestDiagnostics (initial attempt)
                │    └── [1] RequestDiagnostics (retry, if any)
                │
                └── children[] (empty for Request)
```

Every node holds a list of diagnostics from the child nodes it triggered (`children`),
as well as its own Cosmos requests. This makes the diagnostics structure recursive and
directly mirrors the plan graph.

For point operations (SingleNode plan), the hierarchy collapses: the `operationPlan`
is a single Request node with its requests and no children. The existing flat `requests()`
accessor is preserved for backward compatibility by flattening the tree.

### 8.3 Hierarchical Diagnostics Types

```rust
/// Diagnostics for a single plan node's execution.
///
/// This type is recursive: composite nodes (SequentialDrain) contain child
/// `NodeDiagnostics` entries, mirroring the plan graph structure.
pub struct NodeDiagnostics {
    /// What kind of node this was.
    node_type: NodeType,

    /// The EPK range targeted by this node (for Request nodes).
    /// `None` for non-Request nodes.
    feed_range: Option<FeedRange>,

    /// When the node started executing.
    started_at: Instant,

    /// When the node completed.
    completed_at: Instant,

    /// Duration in milliseconds.
    duration_ms: u64,

    /// Total RU charge for this node (including children).
    request_charge: RequestCharge,

    /// Individual Cosmos request diagnostics for this node.
    /// Empty for non-leaf nodes that don't directly issue Cosmos requests.
    /// May contain multiple entries due to retries within the node.
    requests: Vec<RequestDiagnostics>,

    /// Child node diagnostics, for composite nodes (SequentialDrain, future merge nodes).
    /// Empty for leaf nodes (Request).
    /// For SequentialDrain, contains only the nodes that were executed in this call
    /// (typically one Request node per page).
    children: Vec<NodeDiagnostics>,

    /// Outcome of this node's execution.
    outcome: NodeOutcome,
}

/// Outcome of a plan node's execution.
#[derive(Clone, Debug)]
pub enum NodeOutcome {
    /// The node completed successfully.
    Success,
    /// The node failed with an error.
    Failed { message: String },
}

/// Identifies the kind of plan node for diagnostics purposes.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum NodeType {
    /// A Request node that executed a Cosmos request via execute_single_operation.
    Request,
    /// A SequentialDrain node that sequentially processes partitions.
    SequentialDrain,
    // Future: UnorderedMerge, OrderedMerge, Aggregate, etc.
}
```
### 8.4 JSON Representation

Diagnostics serialize to a hierarchical JSON structure with consistent camelCase property
names:

```json
{
  "activityId": "e4b2c1d8-...",
  "totalDurationMs": 42,
  "totalRequestCharge": 5.23,
  "requestCount": 1,
  "operationPlan": {
    "nodeType": "sequentialDrain",
    "startedAt": 0,
    "completedAt": 42,
    "durationMs": 42,
    "requestCharge": 5.23,
    "outcome": "success",
    "requests": [],
    "children": [
      {
        "nodeType": "request",
        "epkRange": { "min": "00", "max": "55" },
        "startedAt": 0,
        "completedAt": 15,
        "durationMs": 15,
        "requestCharge": 5.23,
        "outcome": "success",
        "requests": [
          {
            "executionContext": "initial",
            "pipelineType": "dataPlane",
            "transportSecurity": "secure",
            "transportKind": "gateway",
            "transportHttpVersion": "http2",
            "region": "westus2",
            "endpoint": "https://myaccount.documents.azure.com/",
            "status": "200",
            "requestCharge": 5.23,
            "activityId": "e4b2c1d8-...",
            "serverDurationMs": 3.2,
            "durationMs": 15,
            "events": [
              { "eventType": "transportStart", "durationMs": null },
              { "eventType": "responseHeadersReceived", "durationMs": 12 },
              { "eventType": "transportComplete", "durationMs": 15 }
            ],
            "timedOut": false,
            "requestSent": "sent",
            "error": null
          }
        ],
        "children": []
      }
    ]
  }
}
```

For point operations, the structure is similar but with a single Request node and no wrapping
SequentialDrain:

```json
{
  "activityId": "a1b2c3d4-...",
  "totalDurationMs": 8,
  "totalRequestCharge": 1.0,
  "requestCount": 1,
  "operationPlan": {
    "nodeType": "request",
    "epkRange": null,
    "durationMs": 8,
    "requestCharge": 1.0,
    "outcome": "success",
    "requests": [{ "..." : "..." }],
    "children": []
  }
}
```

### 8.5 Alignment with Existing `DiagnosticsContext`

The existing `DiagnosticsContext` type (in `diagnostics_context.rs`) currently uses a flat
`requests: Arc<Vec<RequestDiagnostics>>` structure. The feed operations change adds the
hierarchical `operationPlan` field while preserving backward compatibility:

```rust
impl DiagnosticsContext {
    /// Returns the plan diagnostics for this operation.
    pub fn operation_plan(&self) -> &NodeDiagnostics { ... }

    /// Returns all Cosmos request diagnostics, flattened across nodes.
    ///
    /// This is backward-compatible with the pre-feed-operations API.
    /// Requests are returned in the order they were executed.
    pub fn requests(&self) -> Arc<Vec<RequestDiagnostics>> {
        // Flatten: recursively collect requests from the node tree.
    }
}
```

The `DiagnosticsContextBuilder` gains node-tracking methods:

```rust
impl DiagnosticsContextBuilder {
    /// Records that a node has started executing.
    pub(crate) fn start_node(
        &mut self,
        node_type: NodeType,
        feed_range: Option<FeedRange>,
    ) -> NodeHandle { ... }

    /// Records that a node has completed, with its requests and children.
    pub(crate) fn complete_node(
        &mut self,
        handle: NodeHandle,
        requests: Vec<RequestDiagnostics>,
        children: Vec<NodeDiagnostics>,
        outcome: NodeOutcome,
    ) { ... }
}
```
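
As an illustration of the back-compat flattening, here is a minimal sketch over simplified
stand-ins for the types above (`RequestDiag` / `NodeDiag` carry only the fields needed to
show the walk; the real types are far richer):

```rust
/// Simplified stand-ins for the diagnostics types above.
struct RequestDiag {
    request_charge: f64,
}

struct NodeDiag {
    requests: Vec<RequestDiag>,
    children: Vec<NodeDiag>,
}

/// Depth-first walk: a node's own requests first, then each child's,
/// preserving execution order, which is the contract `requests()` promises.
fn flatten<'a>(node: &'a NodeDiag, out: &mut Vec<&'a RequestDiag>) {
    out.extend(node.requests.iter());
    for child in &node.children {
        flatten(child, out);
    }
}

/// Total RU for the plan is the sum over the flattened request list.
fn total_charge(root: &NodeDiag) -> f64 {
    let mut all = Vec::new();
    flatten(root, &mut all);
    all.iter().map(|r| r.request_charge).sum()
}

fn main() {
    // A SequentialDrain root with one executed Request child.
    let root = NodeDiag {
        requests: vec![],
        children: vec![NodeDiag {
            requests: vec![RequestDiag { request_charge: 5.23 }],
            children: vec![],
        }],
    };
    assert!((total_charge(&root) - 5.23).abs() < f64::EPSILON);
}
```
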
### 8.6 Verbosity Control

The existing `DiagnosticsVerbosity` enum (Summary / Detailed) controls serialization:

| Verbosity | Behavior |
|-----------|----------|
| **Summary** | Node-level timing included. Individual `RequestDiagnostics` are deduplicated/aggregated as they are today. |
| **Detailed** | Full tree: all node timestamps, all individual `RequestDiagnostics` with events, all children. |

Point operations produce the same output as today at both verbosity levels — the hierarchy
is transparent when there's only one node.

### 8.7 Pagination Context

Each `execute_operation` call produces one `DiagnosticsContext`. The SDK layer manages
pagination and can:

1. **Aggregate across pages** — collect diagnostics from multiple pages to produce a
   summary of the full feed operation (total RU, total duration, pages fetched).

2. **Create OTEL spans** — the SDK can create a parent span for the feed operation,
   child spans for each page, and nested spans for each node, using the timestamps
   and metadata from the diagnostics tree. The driver does not prescribe span structure —
   it provides the data.

---

## 9. Error Handling & Partition Splits & Merges

### 9.1 Partition Split During Execution

Request nodes target **EPK ranges**, not PK range IDs. When a Request node receives a 410/1002
(Gone — PartitionKeyRangeGone) response, the Request node handles the split **internally**:

1. **Invalidate** the `PartitionKeyRangeCache` for the affected container.
2. **Re-fetch** the partition key ranges.
3. **Re-resolve** the Request node's EPK range to the new child PK range IDs.
4. **Internally split** — the single Request node issues requests to the appropriate
   child PK ranges.
5. **Resume execution** with the child range result.

The plan structure remains stable across splits. The Request node absorbs the split
internally without changing the plan graph. The next time the plan is generated (on the
next page), the Planner will see the new split ranges from the PK range cache and create
separate Request nodes for each child range — the continuation token's EPK bounds guide
the resume position correctly.

The continuation token survives because it stores EPK bounds (not PK range IDs), and the
Planner re-resolves those bounds to current PK range IDs on each page.
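
A toy model of the re-resolution in step 3: given the refreshed range list, the node
filters for current PK ranges overlapping its own EPK bounds. `PkRange` and
`resolve_overlapping` are illustrative names, not driver types:

```rust
/// Stand-in for the cache's `PartitionKeyRange` entries.
#[derive(Clone, Debug, PartialEq)]
struct PkRange {
    id: String,
    min: String, // inclusive EPK
    max: String, // exclusive EPK
}

/// Returns the current PK ranges overlapping the node's EPK bounds
/// `[epk_min, epk_max)` — the interval-overlap test behind step 3.
fn resolve_overlapping(ranges: &[PkRange], epk_min: &str, epk_max: &str) -> Vec<PkRange> {
    ranges
        .iter()
        .filter(|r| r.min.as_str() < epk_max && epk_min < r.max.as_str())
        .cloned()
        .collect()
}

fn main() {
    // Before the split the node targeted PK range "1" = ["00","AA").
    // After the split, the cache re-fetch returns two children plus a sibling.
    let refreshed = [
        PkRange { id: "2".into(), min: "00".into(), max: "55".into() },
        PkRange { id: "3".into(), min: "55".into(), max: "AA".into() },
        PkRange { id: "4".into(), min: "AA".into(), max: "FF".into() },
    ];
    // The node's own EPK bounds are unchanged; both children now serve it.
    let children = resolve_overlapping(&refreshed, "00", "AA");
    assert_eq!(children.len(), 2);
}
```
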
### 9.2 Partition Merge During Execution

Cosmos DB may also **merge** adjacent partitions to consolidate underutilized capacity.
After a merge, multiple original PK ranges become one larger PK range. The Request node's
EPK bounds may now fall entirely **inside** a larger merged PK range — the EPK range did
not change, but its owning PK range did.

Merge handling:

1. **Cache miss / 410** — A Request may detect the merge via either a stale-cache PK range
   ID (the old PK range no longer exists) or via a 410/1002 response. The handling mirrors
   the split path: invalidate the cache, re-fetch ranges, re-resolve EPK bounds.
2. **EPK bounds preserved on the wire** — When the Request issues requests against the
   merged PK range, it MUST include `x-ms-documentdb-epk-min` and `x-ms-documentdb-epk-max`
   headers set to its original EPK bounds. This ensures the server returns only items
   inside the Request's intended slice of the merged range, not the entire merged range.
3. **Continuation token survival** — The continuation token's EPK bounds remain valid.
   On the next page, the Planner sees the merged PK range and may produce a single
   Request node spanning what was previously multiple ranges. The token's EPK bounds
   correctly identify the cursor position inside the merged range.

The plan structure changes across pages (fewer Request nodes after a merge), but the
continuation token's semantics are unchanged: it identifies a slice of the EPK space
that has been drained, regardless of how that slice maps to PK ranges.

### 9.3 Error Propagation

| Error Scenario | Behavior |
|----------------|----------|
| 410/1002 (PartitionKeyRangeGone) — split | Request node internally re-resolves EPK range, retries against child PK ranges. |
| 410/1002 (PartitionKeyRangeGone) — merge | Request node internally re-resolves EPK range, retries against the merged PK range with EPK min/max headers. |
| 429 (Throttled) | Handled by transport pipeline (backoff + retry). |
| 503 (Service Unavailable) | Handled by operation pipeline (region failover). |
| 404 (Not Found) — container | Fail the entire feed operation. |
| Transient network error | Handled by transport pipeline (retry). |
| Invalid continuation token | Fail with `ErrorKind::DataConversion`. |

---

## 10. API Semantics & Invariants

### 10.1 Public API

The driver exposes a single `execute_operation` method for **all** operations — both point
and feed. The driver is stateless across calls: each invocation runs one page of the plan
and returns a `CosmosResponse`. The response optionally includes a continuation token when
more pages are available.

```rust
impl CosmosDriver {
    /// Executes a Cosmos DB operation (point or feed).
    ///
    /// For point operations (read, create, delete, etc.), this returns the
    /// single response with no continuation token.
    ///
    /// For feed operations (queries), this executes one page of the plan
    /// and returns the result. If more pages are available, the response
    /// includes a `ContinuationToken`. The caller passes this token back
    /// in `OperationOptions` to fetch the next page.
    pub async fn execute_operation(
        &self,
        operation: CosmosOperation,
        options: OperationOptions,
    ) -> azure_core::Result<CosmosResponse> {
        // Plan → Execute one page → return CosmosResponse
    }
}
```
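
A sketch of the resulting calling pattern at the driver boundary. This assumes
`CosmosOperation`, `OperationOptions`, and `ContinuationToken` are `Clone` and that the
continuation is settable on the options (shown as a plain field for brevity); none of
that is final API:

```rust
// Minimal paging loop over the driver API sketched above (illustrative only).
async fn drain_all(
    driver: &CosmosDriver,
    op: CosmosOperation,
    mut options: OperationOptions,
) -> azure_core::Result<Vec<CosmosResponse>> {
    let mut pages = Vec::new();
    loop {
        // Each call executes exactly one page of the plan.
        let response = driver.execute_operation(op.clone(), options.clone()).await?;
        let next = response.continuation_token().cloned();
        pages.push(response);
        match next {
            // Thread the token back through the options for the next page.
            Some(token) => options.continuation = Some(token),
            // Feed fully drained (or a point operation): stop.
            None => break,
        }
    }
    Ok(pages)
}
```
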
### 10.2 CosmosResponse Changes

`CosmosResponse` gains an optional continuation token, and its `body` field becomes a
`ResponseBody` enum to support both unparsed response bodies (point operations and
single-page feeds the driver passes through verbatim) and pre-parsed item lists
(feeds the driver had to aggregate or whose envelopes it had to crack open),
without forcing every caller to parse a feed envelope:

```rust
/// The body of a Cosmos DB response.
///
/// Mirrors `OperationPayload` on the request side: each variant carries
/// exactly the data shape expected for its kind of operation, and the
/// driver does not deserialize item content.
#[non_exhaustive]
pub enum ResponseBody {
    /// No body (e.g., 204 No Content).
    None,

    /// A response body the driver did not need to parse — raw serialized bytes.
    /// Used for any operation where the driver passes the server response through
    /// verbatim. Depending on the operation, the caller (the SDK) knows whether
    /// these bytes represent a single item (point reads, create/upsert/replace,
    /// resource reads like database/container) or a page of feed data (feed
    /// operations whose envelope the driver did not need to crack open).
    Bytes(Vec<u8>),

    /// A list of document bodies — one entry per item, each entry being
    /// the raw serialized bytes of one item.
    /// Used for feed operations (queries, future read-many) when the
    /// driver had to aggregate results across partitions or otherwise parse
    /// the feed envelope. Exists so the driver does not have to re-serialize
    /// the parsed items just to hand them back to the SDK. The driver does
    /// not deserialize the items themselves.
    Items(Vec<Vec<u8>>),
}

#[non_exhaustive]
pub struct CosmosResponse {
    /// Response body. Variant depends on operation type.
    body: ResponseBody,

    /// Extracted Cosmos-specific headers.
    headers: CosmosResponseHeaders,

    /// Operation status including HTTP status code and optional sub-status.
    status: CosmosStatus,

    /// Full diagnostics context for this operation.
    diagnostics: Arc<DiagnosticsContext>,

    /// Continuation token for feed operations.
    /// Present when more pages are available; absent for point operations
    /// and when the feed is fully drained.
    continuation_token: Option<ContinuationToken>,
}

impl CosmosResponse {
    /// Returns the response body.
    pub fn body(&self) -> &ResponseBody {
        &self.body
    }

    /// Returns the continuation token, if more pages are available.
    ///
    /// For point operations, this always returns `None`.
    /// For feed operations, `None` means the operation is complete.
    pub fn continuation_token(&self) -> Option<&ContinuationToken> {
        self.continuation_token.as_ref()
    }
}
```

The `Items(Vec<Vec<u8>>)` shape lets the SDK iterate items and apply per-item
deserialization (with per-item error handling) without first parsing the entire
feed envelope itself.
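
For example, an SDK-side page decoder might take the following shape. The
`deserialize_page` helper is hypothetical; it shows the per-item error handling the
`Items` variant enables:

```rust
use serde::de::DeserializeOwned;

/// Per-item deserialization with per-item error handling, as the SDK layer
/// might do it. `T` is the caller's item type; the driver hands over raw
/// bytes and never sees `T`.
fn deserialize_page<T: DeserializeOwned>(
    body: &ResponseBody,
) -> Vec<Result<T, serde_json::Error>> {
    match body {
        // Pre-split item list: one raw JSON document per entry, so a single
        // malformed item fails alone instead of poisoning the whole page.
        ResponseBody::Items(items) => items
            .iter()
            .map(|bytes| serde_json::from_slice::<T>(bytes))
            .collect(),
        // `Bytes` / `None` (and any future variants) are not item lists.
        _ => Vec::new(),
    }
}
```
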
### 10.3 OperationOptions Changes

`OperationOptions` gains feed-specific fields:

```rust
pub struct OperationOptions {
    // ... existing fields (retry, timeout, consistency, etc.) ...

    /// Maximum number of items per page (feed operations only).
    ///
    /// **This is always a hint.** The driver and the server may deviate from it
    /// in well-defined cases:
    ///
    /// - The server may return fewer items than requested (e.g., a partition
    ///   has fewer items than `max_item_count`).
    /// - Some operations require returning a logical group of items together,
    ///   even if that group exceeds `max_item_count`. The most prominent case
    ///   is the change feed, where all documents sharing the same LSN
    ///   (logical sequence number) are returned in the same page to preserve
    ///   atomicity. `SELECT *` queries do not have this constraint today, but the
    ///   contract is the same: callers MUST treat `max_item_count` as a hint,
    ///   not a hard cap.
    ///
    /// If not set, the server default applies.
    max_item_count: Option<u32>,

    /// Continuation token for resuming a previous feed operation.
    /// Pass the token from a previous `CosmosResponse::continuation_token()`.
    continuation: Option<ContinuationToken>,
}
```

These fields are ignored for point operations.

### 10.4 Ordering Guarantees

| Operation | Order Guarantee |
|-----------|-----------------|
| `SELECT *` (single partition) | (PartitionKey, ID) ascending. |
| `SELECT *` (cross-partition) | Within each partition, (PartitionKey, ID) ascending. Across partitions, items are yielded in EPK order (implementation behavior, not a service guarantee). |

### 10.5 Page Boundaries

Each `execute_operation` call for a cross-partition `SELECT *` returns exactly one page from
exactly one partition:

- **Server-side max item count**: The server may return fewer items than requested.
- **Client-side max item count**: Configurable via `OperationOptions::max_item_count`.
  This is **always a hint** — the driver may exceed it when an operation requires
  returning a logical group of items together (e.g., change feed returns all documents
  sharing the same LSN in the same page). Callers MUST NOT treat the value as a hard cap.
- **Server continuation**: A page boundary occurs whenever the server returns a continuation
  token.
- **Partition boundary**: When a partition is fully drained (no server continuation), the
  current page is returned. The next call starts the next partition.

Pages never span partition boundaries.
- ▼ -CosmosResponse { body, headers, status, diagnostics, continuation_token } - │ SDK converts ResponseBody::Items into FeedPage via T: DeserializeOwned - ▼ -user code -``` - -Two consequences of this layering: - -1. **Mutual exclusivity is a property of the type, not a runtime check.** A - `target: OperationTarget` parameter cannot express "both a partition key and a feed - range" — the user picks one constructor (`OperationTarget::partition(pk)`, - `OperationTarget::feed_range(fr)`, or `OperationTarget::all_ranges()`). The SDK does - not need a runtime guard, and the driver-side `OperationTarget` enum carries the - same guarantee all the way down. - -2. **Options that the driver ignores for a given operation are still allowed at the SDK - layer.** For example, `max_item_count` on a point-read SDK call is silently dropped by - the driver (point ops produce one response). This keeps the SDK option structs ergonomic - and consistent, with the driver as the single point that decides which knobs are - meaningful for which operation. - -#### Per-operation SDK option structs (sketch) - -| SDK method | Options struct | Notable fields it injects into `CosmosOperation` / `OperationOptions` | -|------------|----------------|----------------------------------------------------------------------| -| `read_item` | `ReadItemOptions` | consistency, session, retry → `OperationOptions` | -| `create_item` / `upsert_item` / `replace_item` | `ItemOptions` | indexing directive, pre/post triggers → request headers; consistency → options | -| `delete_item` | `DeleteItemOptions` | consistency → options | -| `query_items` | `QueryOptions` (+ required `target: OperationTarget` positional arg) | `target` → `OperationTarget`; `max_item_count`, `continuation`, consistency → `OperationOptions` | - -The SDK does not expose `OperationOverrides` to user code at all — it is purely an -internal type used by the driver to thread per-fetch state through -`execute_single_operation`. Users control per-fetch behavior indirectly: they set -`max_item_count` once on the SDK options struct, and the driver applies it to every -fetch unless a specific plan node has a reason to override (none today). - ---- - -## 11. Testing Strategy - -### 11.1 Unit Tests - -| Test Area | Cases | -|-----------|-------| -| Planner — point ops | Verify SingleNode plan for each point operation type. | -| Planner — cross-partition `SELECT *` | Verify Graph plan with SequentialDrain root, correct Request children per PK range. | -| Planner — cross-partition `SELECT *` resume | Verify resume skips drained partitions, resumes active, starts right fresh. | -| Planner — bottom-up invariant | Verify children always have lower NodeIds than parents. | -| PlanExecutor — single node | Execute SingleNode plan, verify result matches direct pipeline call. | -| PlanExecutor — drain | Execute SequentialDrain plan with mock pipeline, verify sequential execution. | -| PlanExecutor — drain page boundary | Verify pages don't span partition boundaries. | -| ContinuationToken — serialize | Serialize to base64url string, verify roundtrip. | -| ContinuationToken — deserialize | Deserialize from explicit string, verify result. | -| ContinuationToken — version compat | Older version tokens deserialize correctly. | -| ContinuationToken — future version | Token with version > current is rejected. | -| ContinuationToken — operation kind | Token with wrong operation kind is rejected. 
---

## 11. Testing Strategy

### 11.1 Unit Tests

| Test Area | Cases |
|-----------|-------|
| Planner — point ops | Verify SingleNode plan for each point operation type. |
| Planner — cross-partition `SELECT *` | Verify Graph plan with SequentialDrain root, correct Request children per PK range. |
| Planner — cross-partition `SELECT *` resume | Verify resume skips drained partitions, resumes the active one, starts ranges to the right fresh. |
| Planner — bottom-up invariant | Verify children always have lower NodeIds than parents. |
| PlanExecutor — single node | Execute SingleNode plan, verify result matches direct pipeline call. |
| PlanExecutor — drain | Execute SequentialDrain plan with mock pipeline, verify sequential execution. |
| PlanExecutor — drain page boundary | Verify pages don't span partition boundaries. |
| ContinuationToken — serialize | Serialize to base64url string, assert against the exact expected string (no round-trip; see §13.0). |
| ContinuationToken — deserialize | Deserialize from an explicit string, verify the result. |
| ContinuationToken — version compat | Older version tokens deserialize correctly. |
| ContinuationToken — future version | Token with version > current is rejected. |
| ContinuationToken — operation kind | Token with wrong operation kind is rejected. |
| ContinuationToken — split recovery | Token with EPK bounds spanning a split range maps to correct child ranges. |
| ContinuationToken — SequentialDrain resume | SequentialDrain node correctly classifies partitions as left/target/right. |
| ContinuationToken — nesting | Nested tokens serialize to the expected exact string and parse back from a fixed input (no round-trip tests; see §13.0). |
| ContinuationToken — unknown variant | Unknown `ResumeState` type fails gracefully on deserialize. |
| NodeId/NodeRange | Verify range iteration, length, empty checks. |
| OperationTarget — variants | Verify `PartitionKey`, `all_ranges()`, and custom `FeedRange` produce correct targets. |
| OperationTarget — mutual exclusivity | `OperationTarget` cannot express both a partition key and a feed range; enforced at the type level (§10.6), so no runtime rejection is needed. |
| OperationOverrides — feed_range / continuation / max_item_count | Verify overrides translate to `x-ms-documentdb-epk-min/max`, `x-ms-continuation`, `x-ms-max-item-count` headers. |
| OperationOverrides — base op reuse | One `Arc<CosmosOperation>` drives multiple fetches with distinct overrides; base op is never cloned. |
| Single-partition query fast-path | `OperationTarget::PartitionKey` + `OperationPayload::Query` produces a SingleNode plan, no PK range cache lookup, no query-plan fetch. |
| Single-partition query — ORDER BY / aggregates | Verify `ORDER BY`, `GROUP BY`, `COUNT(*)` queries are forwarded verbatim and pass through. |
| Diagnostics — hierarchy | Verify recursive node tree structure appears in diagnostics JSON. |
| Diagnostics — children | Verify composite nodes contain child node diagnostics. |
| Diagnostics — backward compat | Verify `requests()` flattening returns all requests from nested nodes. |

### 11.2 Integration Tests

| Test Area | Cases |
|-----------|-------|
| `SELECT *` — basic | Read all items from a container, verify all returned in EPK order. |
| `SELECT *` — empty container | `SELECT *` on an empty container returns no results, no continuation. |
| `SELECT *` — single partition | All items in one partition, verify SingleNode plan execution. |
| `SELECT *` — multi partition | Items across multiple partitions, verify sequential drain. |
| `SELECT *` — pagination | Verify continuation token threads correctly across pages. |
| `SELECT *` — resume | Get continuation mid-stream, resume from it, verify continued results. |
| `SELECT *` — resume across SDK versions | Serialize token, deserialize with newer SDK, verify resume works. |
| `SELECT *` — partition split | Trigger split during cross-partition `SELECT *`, verify Request node re-resolves and completes. |
| `SELECT *` — large dataset | Read many items, verify all pages and partitions are drained. |
| `SELECT * WHERE` — server-side filter | Verify `WHERE` predicate is applied server-side, only matching items returned. |
| Diagnostics — RU aggregation | Verify total RU charge sums across all pages. |
| Diagnostics — plan structure | Verify diagnostics JSON shows SequentialDrain/Request hierarchy with children. |

### 11.3 Performance Tests

| Test Area | Metric |
|-----------|--------|
| Point op overhead | Latency regression < 1% vs. direct `execute_single_operation`. |
| Cross-partition `SELECT *` latency | Sequential partition drain does not introduce unnecessary overhead. |
---

## 12. Future Work

### 12.1 ReadMany

ReadMany reads multiple items by (ID, PartitionKey) pairs. It requires grouping items by
PK range, creating concurrent `Request` nodes, and merging results via an `UnorderedMerge`
node. This adds concurrency control (semaphore-based) to the PlanExecutor and a new
`PlanNode::UnorderedMerge` variant.

### 12.2 Cross-Partition Queries

Cross-partition queries with `ORDER BY`, `GROUP BY`, aggregates (`COUNT`, `SUM`, …),
`OFFSET / LIMIT`, vector search, or hybrid search require fetching a backend query plan,
creating `Request` nodes per partition, and optionally performing client-side sort / merge
via an `OrderedMerge` node. This adds query-plan fetching callbacks to the Planner and
k-way merge logic to the PlanExecutor.

The same query-plan path is also planned to back single-partition vector / hybrid queries
(see [§5.2.1](#521-single-partition-query-fast-path)) so the driver can apply score
normalization and other plan-driven hints uniformly. For non-vector single-partition
queries the fast-path remains.

### 12.3 Change Feed

The change feed is a specialized feed operation with unique characteristics: start-from
modes, lease-based partition assignment, and incremental/full-fidelity modes.

Unlike a cross-partition `SELECT *`'s sequential drain (where only the active partition's
state is needed), change feed requires **per-range continuation tokens**. Each feed range
maintains its own server continuation, and the resume state is a list of per-range tokens:

```rust
// Future ResumeState variant (illustrative)
#[serde(rename = "changeFeed")]
ChangeFeed(ChangeFeedState),

#[serde(rename_all = "camelCase")]
struct ChangeFeedState {
    /// Per-range continuation tokens.
    /// Each entry tracks one feed range's EPK bounds and its
    /// server-provided continuation token.
    range_tokens: Vec<RangeToken>,
}

#[serde(rename_all = "camelCase")]
struct RangeToken {
    epk_min: String,
    epk_max: String,
    server_token: Option<String>,
}
```

This is an example where per-partition state is necessary (the token size is O(N) in
range count), as noted in [§7.1 Design Principles](#71-design-principles). The plan
model reserves extension points in `PlanNode` and `ResumeState` for change feed support.
### 12.4 Concurrency

Future operations (ReadMany, cross-partition queries) will require concurrent partition
fetching. The concurrency permit model described in [§5.6](#56-future-extensions) provides
the foundation: a shared semaphore limits the number of concurrent permits, and each plan
node acquires a permit before executing. This will add `UnorderedMerge` / `OrderedMerge`
nodes to the plan model.

### 12.5 Cached Operation Plans

For in-process callers that call `execute_operation` in a loop, caching the `OperationPlan`
across pages (invalidating on metadata changes) would avoid re-planning on every page. This
is a performance optimization, not a correctness concern.

### 12.6 Hedging for Feed Operations

The existing hedging mechanism (speculative execution in secondary regions) could be extended
to individual plan nodes, allowing feed fetches to hedge independently.

### 12.7 Dedicated `ReadAllItems` Convenience Operation

Today the unfiltered "read every document in the container" case is expressed as a
`SELECT * FROM c` query. A future revision may add a dedicated `read_all_items` SDK method
and a corresponding `CosmosOperation::read_all_items(...)` factory backed by the existing
`OperationType::ReadFeed` (point-read-feed) wire path. That path avoids the
`application/query+json` envelope and reads at the gateway as a feed read rather than a
query, which can be cheaper RU-wise on large containers. The plan model and continuation
token format above already accommodate this — only the payload variant and the chosen
wire shape differ — so this is purely additive.

---

## 13. Implementation Plan

This section is the execution checklist for landing the spec. It is split into **two
PR-sized phases**. Each phase ends with a working, mergeable, end-to-end slice — Phase 1
unblocks single-partition queries; Phase 2 unblocks cross-partition queries.

The plan is deliberately mechanical so a follow-up coding agent can execute it without
re-deriving design decisions. Cross-references to the design sections above are inline.

### 13.0 Conventions

- All new public types live in `azure_data_cosmos_driver` and are re-exported from
  `azure_data_cosmos` only when the SDK layer needs them in its public surface.
- All driver-internal types are `pub(crate)`.
- New code must derive `SafeDebug` (not `Debug`) for any type that may carry user data.
- Every public type needs a doc comment summary + details.
- Each phase ends with: `cargo fmt -p azure_data_cosmos_driver -p azure_data_cosmos`,
  `cargo clippy -p azure_data_cosmos_driver -p azure_data_cosmos --all-features`,
  `cargo test -p azure_data_cosmos_driver -p azure_data_cosmos --all-features`. All three
  must be clean before opening the PR.
- **Integration tests run against the live Cosmos DB Emulator**, not via test-proxy
  recordings. Agents should expect the user to assist with starting / pointing at the
  emulator and reviewing the resulting test runs.
- **Serialization tests never use round-trip assertions.** For each `Display` /
  `Serialize` impl, assert against an exact expected string. For each `FromStr` /
  `Deserialize` impl, feed an exact input string and assert the parsed structure. Tests
  MAY locally base64-decode the wire format inside the test body so that the on-disk
  exemplar can stay as plain JSON. (A sketch of this test shape follows this list.)
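
As an illustration of that last rule, a Phase-1 token test might take the following shape.
`ContinuationToken::for_test` is a hypothetical test-only constructor, and the JSON
exemplar reuses the single-partition sample from §7.4:

```rust
#[test]
fn continuation_token_serializes_to_exact_string() {
    use base64::Engine as _;

    // Hypothetical test-only constructor; stands in for building a token
    // with exactly the fields in the exemplar below.
    let token = ContinuationToken::for_test();

    // The exemplar stays human-readable JSON in the test body...
    let exemplar = r#"{"version":1,"containerRid":"dbs/abc/colls/def","operationKind":"query","resume":{"type":"request","epkMin":"55","epkMax":"AA","serverToken":"-RID:QmFzZTY0#RT:3#TRC:50"}}"#;
    // ...and is base64url-encoded locally to derive the exact wire string.
    let expected = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(exemplar);

    // Exact-string assertion, not a decode/encode round-trip.
    assert_eq!(token.to_string(), expected);
}
```
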
---

### 13.1 Phase 1 — Single-Node Plans

**End-state for Phase 1.** A user can write:

```rust
let mut pager = container.query_items::<MyItem>(
    "SELECT * FROM c WHERE c.region = 'westus'",
    [],
    OperationTarget::partition("westus"),
    None,
)?;
while let Some(page) = pager.next().await {
    let page = page?;
    // ... use page.items, page.continuation_token() ...
}
```

…and every existing point operation (`read_item`, `create_item`, `delete_item`, etc.)
goes through the new Plan → Execute pipeline. Cross-partition / `FeedRange` targets are
type-acceptable but error at planning time pointing to "Phase 2".

#### 13.1.1 Foundational types in `azure_data_cosmos_driver`

These can be done in any order but must all land before the planner work.

1. **Migrate `FeedRange` into the driver and extend it.** §3.2.1.
   - Move `sdk/cosmos/azure_data_cosmos/src/feed_range.rs` to
     `sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs`. Add to
     `models/mod.rs`. Make the type `pub`.
   - Add `pub use azure_data_cosmos_driver::models::FeedRange;` to
     `azure_data_cosmos/src/lib.rs` (or wherever the existing re-export lives) so
     the public API does not change.
   - Update the driver's internal callers that previously used the old path.
   - **Add `pub fn FeedRange::for_partition_key(pk: impl Into<PartitionKey>, definition: &PartitionKeyDefinition) -> azure_core::Result<Self>`.**
     Computes the EPK for the given partition-key value and returns a single-EPK
     `FeedRange` whose `min_inclusive == max_exclusive == EPK(pk)`. Returns an error if
     the value count does not match the definition (full keys only — Phase 1 does not
     attempt to support MultiHash prefix keys here; defer that to a follow-up if
     needed).
   - **Add `pub fn FeedRange::is_singleton(&self) -> bool`** returning `true` iff the
     range bounds collapse to a single EPK (i.e. the inclusive lower bound equals the
     bound that would otherwise be exclusive — the implementation should treat the
     "singleton" representation as a closed-closed range over one EPK or use whatever
     internal marker `for_partition_key` produces, as long as the predicate is exact
     for those constructed values and `false` for any range covering more than one
     EPK).
   - **Remove `ContainerClient::feed_range_from_partition_key`** from the SDK — its
     functionality is now `FeedRange::for_partition_key` (callers can fetch the
     `PartitionKeyDefinition` from the container themselves, or via a thin sync
     helper on `ContainerClient` that returns the definition). Update the changelog.
   - **Acceptance:** `cargo build -p azure_data_cosmos -p azure_data_cosmos_driver`.

2. **Add `OperationTarget`.** §3.2.
   - New file `models/operation_target.rs`. Public enum with three variants
     (`None`, `PartitionKey { key: PartitionKey, feed_range: FeedRange }`,
     `FeedRange(FeedRange)`).
   - The `PartitionKey` variant carries both the logical key (used for the
     `x-ms-documentdb-partitionkey` header on gateway-issued requests) and its
     singleton `FeedRange` (the range whose min/max EPK equals `EPK(key)`).
     **The EPK headers are NOT used for `OperationTarget::PartitionKey` requests** —
     the gateway routes by the logical-PK header. The singleton `FeedRange` exists so
     downstream planning, continuation tokens, and merge-recovery can reason about a
     PK target uniformly with `FeedRange` targets.
   - Implement the named constructors (usage sketched after this item):
     - `pub fn partition(key: impl Into<PartitionKey>, definition: &PartitionKeyDefinition) -> azure_core::Result<Self>` —
       computes the singleton `FeedRange` via `FeedRange::for_partition_key` and
       stores both. Errors propagate from `for_partition_key`.
     - `pub fn feed_range(fr: impl Into<FeedRange>) -> Self`.
     - `pub fn all_ranges() -> Self` — returns `Self::FeedRange(FeedRange::full())`.
   - Add accessors: `pub fn partition_key(&self) -> Option<&PartitionKey>`,
     `pub fn feed_range(&self) -> Option<&FeedRange>` (returns `Some` for both
     `PartitionKey` and `FeedRange` variants).
   - Do **not** implement `From<PartitionKey>` or `From<FeedRange>`. §10.6.
   - Re-export from `models/mod.rs`.
   - **Note on the SDK's `ContainerClient::query_items` ergonomics:** since
     `OperationTarget::partition` requires a `PartitionKeyDefinition`, the SDK
     provides a thin wrapper `ContainerClient::partition_target(key) -> OperationTarget`
     that pulls the definition from `container_connection`. Driver-level callers that
     already hold a `PartitionKeyDefinition` (e.g. existing point-op factories) call
     `OperationTarget::partition` directly.
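
Putting the constructors together, call sites would look roughly like this sketch (the
import paths and the `pk_definition` / `some_feed_range` bindings are assumptions for
illustration, as is `&str: Into<PartitionKey>`):

```rust
// Assumed import paths for the Phase-1 driver types.
use azure_data_cosmos_driver::models::{FeedRange, OperationTarget, PartitionKeyDefinition};

fn build_targets(
    pk_definition: &PartitionKeyDefinition,
    some_feed_range: FeedRange,
) -> azure_core::Result<()> {
    // Single logical partition: the singleton FeedRange is computed internally.
    let by_pk = OperationTarget::partition("tenant-42", pk_definition)?;

    // A specific range, e.g. from a previous feed-range enumeration.
    let by_range = OperationTarget::feed_range(some_feed_range);

    // The whole container: FeedRange::full() under the hood.
    let whole = OperationTarget::all_ranges();

    // Accessors work uniformly: a PK target exposes its singleton range too.
    assert!(by_pk.feed_range().is_some());
    assert!(by_range.partition_key().is_none());
    assert!(whole.partition_key().is_none());
    Ok(())
}
```
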
3. **Add `OperationPayload`.** §3.1.
   - New enum with `None`, `Body(Vec<u8>)`, `Query { query, parameters }`.
   - Define the Phase-1 `QueryParameter` shape: `{ name: String, value: serde_json::Value }`
     unless an equivalent already exists; if one does, reuse it.
   - Add a `pub fn body(&self) -> Option<&[u8]>` for transport-layer convenience.

4. **Add `OperationOverrides`.** §6.3.
   - Fields exactly: `feed_range: Option<FeedRange>`, `continuation: Option<String>`,
     `max_item_count: Option<u32>`. `Default`-derive friendly.
   - Document the allow/deny list inline.

5. **Add `ContinuationToken` (Request variant only).** §7.
   - `pub struct ContinuationToken { inner: ContinuationTokenInner }`.
   - Internal `ContinuationTokenInner { version, container_rid, operation_kind, resume }`.
   - `enum ResumeState { Request(RequestState) }` only — `SequentialDrain` is added in
     Phase 2.
   - `RequestState` carries `server_token: String` plus a `target` discriminator that
     captures the original `OperationTarget`:
     - For `OperationTarget::PartitionKey { key, .. }`: store the **logical partition
       key value** (serialized via `PartitionKey`'s existing wire form). On resume,
       the planner reconstructs the singleton `FeedRange` by re-running
       `FeedRange::for_partition_key` with the container's current
       `PartitionKeyDefinition`.
     - For `OperationTarget::FeedRange(_)`: store `epk_min` / `epk_max`.
     - For `OperationTarget::None`: store nothing extra (control-plane resumes are
       rare but the variant exists for symmetry).
   - Implement `Display` (base64url-encoded JSON, no padding) and `FromStr` (decode +
     version check). Tests assert the exact JSON exemplar — see §13.1.7.
   - Const `CURRENT_TOKEN_VERSION: u32 = 1`.
   - **Version preservation rule** (§7.1): emit output tokens at the same version as the
     input token; emit `CURRENT_TOKEN_VERSION` only when there is no input token.
     This only applies if the incoming version is KNOWN to the Driver, so nothing is
     actually needed here since there is only one version and the Driver already
     emits it.

6. **Add `ResponseBody` and update `CosmosResponse`.** §10.2.
   - Variants `None`, `Bytes(Vec<u8>)`, `Items(Vec<Vec<u8>>)`.
   - Replace `CosmosResponse::body() -> &[u8]` with `body() -> &ResponseBody` (and
     remove `into_body`). Update every caller. A convenience `as_bytes(&self) -> Option<&[u8]>`
     is acceptable to ease migration.
   - Add a `continuation_token: Option<ContinuationToken>` field + `continuation_token()`
     accessor.

#### 13.1.2 `CosmosOperation` refactor

§3.

7. **Field swap.** Replace `body: Option<Vec<u8>>` with `payload: OperationPayload`;
   replace `partition_key: Option<PartitionKey>` with `target: OperationTarget`.
   Keep `with_body(Vec<u8>)` as sugar for `with_payload(OperationPayload::Body(...))`.
   Add `with_payload(OperationPayload)` and `with_target(OperationTarget)`.

8. **Update all existing factory methods** in `models/cosmos_operation.rs` to populate
   `target` and `payload` correctly (every `read_item`, `create_item`, `delete_item`,
   `batch`, `query_items`, `read_all_databases`, etc.). Point ops use
   `OperationTarget::partition(pk)`; account/database ops use `OperationTarget::None`;
   the existing `query_items` factory keeps `OperationTarget::partition(pk)` for now.

9. **Add `CosmosOperation::query(container, query, parameters)`.** §3.3. Defaults to
   `OperationTarget::all_ranges()` so callers that target a single PK must call
   `.with_target(OperationTarget::partition(pk))`.
#### 13.1.3 Pipeline rename + overrides plumbing

10. **Rename `execute_operation_pipeline` → `execute_single_operation`.** §2 / §6.3.
    - New signature:
      `async fn execute_single_operation(&self, operation: &CosmosOperation, options: &OperationOptions, overrides: &OperationOverrides, diagnostics: &mut DiagnosticsContextBuilder) -> azure_core::Result<CosmosResponse>`.
    - Call sites within the driver pass a default `OperationOverrides` for now (point
      ops don't use it).

11. **Apply overrides in the request builder.**
    - `overrides.continuation` → `x-ms-continuation` header.
    - `overrides.max_item_count` → `x-ms-max-item-count` header (falls back to
      `options.max_item_count` if unset).
    - `overrides.feed_range` → `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max`
      headers, **only when the operation's `target` is NOT `OperationTarget::PartitionKey`**.
      Gateway-routed PK queries use the logical-PK header instead, and rejecting both
      together is the simplest correctness rule. The planner is responsible for never
      setting `overrides.feed_range` on a PK-targeted node, but the request builder
      enforces the invariant defensively (debug-assert in the builder).
      Phase 1 never sets EPK headers from the planner, but the wiring must be in
      place for Phase 2 — add a unit test that sets `overrides.feed_range` on an
      `OperationTarget::None` operation to lock the behavior.

12. **Translate `OperationPayload` to wire body.** (The query envelope is sketched
    after this list.)
    - `None` → no body.
    - `Body(b)` → bytes verbatim, `Content-Type: application/json`.
    - `Query { query, parameters }` → JSON envelope `{"query":..., "parameters":[...]}`,
      `Content-Type: application/query+json`. Set `x-ms-documentdb-isquery: True` and
      `x-ms-documentdb-query-iscontinuationexpected: True` for the cross-partition path
      later, but for Phase 1 single-partition queries set `IsContinuationExpected: True`
      and **omit** `x-ms-documentdb-query-enablecrosspartition`.

13. **Extract response continuation.** When `x-ms-continuation` is present on the
    response, set `CosmosResponse.continuation_token` to a `ContinuationToken` whose
    `ResumeState::Request` carries:
    - the `server_token` from the header,
    - the `target` discriminator captured from the originating `CosmosOperation`:
      - `OperationTarget::PartitionKey { key, .. }` → store the logical PK value
        (NOT the EPK bounds — those are reconstructed from the definition on resume),
      - `OperationTarget::FeedRange(fr)` → store `fr`'s `min_inclusive` / `max_exclusive`,
      - `OperationTarget::None` → no extra fields.
    - The `container_rid` and `operation_kind` come from the `CosmosOperation` /
      `OperationType`. Use `"query"` for `OperationType::Query`, the operation type
      name for everything else.

14. **`ResponseBody` variant selection.**
    - For `OperationPayload::Query`, parse the response envelope and emit
      `ResponseBody::Items(Vec<Vec<u8>>)`. The driver does **not** deserialize items —
      it slices the `Documents` array into a `Vec<Vec<u8>>` of raw JSON values. (Use
      `serde_json::value::RawValue` or equivalent to avoid a double parse.)
    - For all other operations, emit `ResponseBody::Bytes(...)` for non-empty bodies and
      `ResponseBody::None` for 204 / empty bodies.
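
A minimal sketch of the envelope serialization from step 12, using the Phase-1
`QueryParameter` shape from step 3. The `QueryEnvelope` struct is illustrative; the
driver may build the JSON differently:

```rust
use serde::Serialize;

/// Illustrative `application/query+json` envelope. Field names are the
/// wire names shown in step 12.
#[derive(Serialize)]
struct QueryEnvelope<'a> {
    query: &'a str,
    parameters: &'a [QueryParameter],
}

/// The Phase-1 parameter shape from step 3.
#[derive(Serialize)]
struct QueryParameter {
    name: String,
    value: serde_json::Value,
}

fn main() {
    let envelope = QueryEnvelope {
        query: "SELECT * FROM c WHERE c.year = @y",
        parameters: &[QueryParameter { name: "@y".into(), value: 2026.into() }],
    };
    let body = serde_json::to_vec(&envelope).unwrap();
    assert_eq!(
        String::from_utf8(body).unwrap(),
        r#"{"query":"SELECT * FROM c WHERE c.year = @y","parameters":[{"name":"@y","value":2026}]}"#
    );
}
```
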
#### 13.1.4 Plan model + planner + executor (single-node only)

§4–§6.

15. **Plan types — minimal shape.**
    - `pub(crate) enum OperationPlan { SingleNode(PlanNode) }` — no `Graph` variant yet
      (defer to Phase 2 with a `// TODO(phase-2): Graph variant`).
    - `pub(crate) enum PlanNode { Request { operation: Arc<CosmosOperation>, options: OperationOptions, feed_range: Option<FeedRange>, continuation: Option<String> } }` —
      no `SequentialDrain` yet.
    - `NodeId` / `NodeRange` are NOT needed in Phase 1; add them in Phase 2.

16. **`Planner::plan(operation, options, continuation)`.** §5. (The acceptance rules
    are condensed in the sketch after this list.)
    - Always returns `Ok(OperationPlan::SingleNode(...))` in Phase 1.
    - Acceptance rules:
      - Any `OperationType` other than `Query` → SingleNode regardless of target. (Point
        ops, batch, control-plane ops.)
      - `OperationType::Query` with `OperationTarget::PartitionKey(_)` → SingleNode
        fast-path (no query plan fetch). §5.2.1.
      - `OperationType::Query` with `OperationTarget::FeedRange(_)` /
        `OperationTarget::all_ranges()` → return
        `Err(azure_core::Error::with_message(ErrorKind::Other, "cross-partition queries are not yet supported (planned for Phase 2)"))`.
        Phase 2 lifts this.
    - When a continuation token is present, validate:
      - `version <= CURRENT_TOKEN_VERSION` (already checked in `FromStr`).
      - `container_rid` matches the operation's container RID.
      - `operation_kind` matches.
      - `ResumeState` is `Request` (Phase 1 only knows that variant). Otherwise
        `ErrorKind::DataConversion`.
      - The token's stored `target` discriminator matches `operation.target()`'s
        variant. For a `PartitionKey` token, the stored PK value must equal
        `operation.target()`'s PK value (otherwise reject with
        `ErrorKind::DataConversion`). For a `FeedRange` token the stored EPK bounds
        must equal the operation's `FeedRange`.
    - Seed `PlanNode::Request.continuation` with `RequestState.server_token`.

17. **`PlanExecutor::execute(plan, driver_context, diagnostics)`.** §6.
    - Match `OperationPlan::SingleNode(node)` and dispatch to a private
      `execute_request_node`.
    - `execute_request_node` builds an `OperationOverrides` from the node's
      `feed_range` + `continuation` + `options.max_item_count` and calls
      `execute_single_operation`. Returns the `CosmosResponse` straight through.

18. **`CosmosDriver::execute_operation` rewrite.** §10.1.
    - Sequence: plan → execute one page → return the `CosmosResponse`.
    - The continuation token already lives on the response from step 13.
    - Existing point operations now go through this path. Verify with the existing
      point-op integration test suite — must pass with no test changes.

19. **Send-future invariant.** The existing
    `_assert_execute_operation_future_is_send` compile-time assertion must continue to
    hold after the rewrite. Do not introduce non-`Send` types into the plan/executor.
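
Condensed into code, the step-16 acceptance rules look like the following sketch. The
enums are simplified stand-ins without payloads, and the real planner also performs the
continuation-token validation listed above:

```rust
// Simplified stand-ins; the real types carry payloads and more variants.
enum OperationType { Query, PointOp }
enum OperationTarget { None, PartitionKey, FeedRange }
enum Plan { SingleNode /* Graph arrives in Phase 2 */ }

/// Phase-1 acceptance rules from step 16, condensed.
fn plan(op: OperationType, target: OperationTarget) -> Result<Plan, &'static str> {
    match (op, target) {
        // Point, batch, and control-plane ops: trivial plan regardless of target.
        (OperationType::PointOp, _) => Ok(Plan::SingleNode),
        // Single-partition query fast-path: no PK range lookup, no plan fetch.
        (OperationType::Query, OperationTarget::PartitionKey) => Ok(Plan::SingleNode),
        // FeedRange / all_ranges targets are deferred to Phase 2.
        (OperationType::Query, _) => {
            Err("cross-partition queries are not yet supported (planned for Phase 2)")
        }
    }
}

fn main() {
    assert!(matches!(
        plan(OperationType::Query, OperationTarget::PartitionKey),
        Ok(Plan::SingleNode)
    ));
    assert!(plan(OperationType::Query, OperationTarget::FeedRange).is_err());
}
```
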
#### 13.1.5 SDK surface (`azure_data_cosmos`)

§10.6.

20. **Re-export `OperationTarget`.** Add `pub use azure_data_cosmos_driver::models::OperationTarget;`
    next to the existing `FeedRange` re-export.

21. **`QueryOptions` change.** Add `pub continuation: Option<ContinuationToken>` and
    `pub max_item_count: Option<u32>`. Add chained setters
    `with_continuation(...)`, `with_max_item_count(...)`. Keep `session_token` and
    `operation` fields unchanged.

22. **`query_items` signature change.** Replace
    `partition_key: impl Into<PartitionKey>` with `target: OperationTarget`. Update doc
    comments and example snippets to use `OperationTarget::partition(...)`.
    - In Phase 1, calling with `OperationTarget::feed_range(_)` /
      `OperationTarget::all_ranges()` returns the planner error from step 16. The SDK
      does not need its own validation — it surfaces the driver error. Add a doc note
      that cross-partition support arrives in Phase 2.

23. **Pager-style return value.** The existing `query_items` returns
    `FeedItemIterator` built by `QueryExecutor::into_stream()`. Replace its
    implementation so each underlying page is produced by
    `driver.execute_operation(op, opts)`:
    - Build the `CosmosOperation::query(...)` once with the user's `target`.
    - Loop: feed `options.continuation` into `OperationOptions`, await
      `execute_operation`, emit a `FeedPage` from `ResponseBody::Items`, set the
      next-iteration continuation from the response, stop when no continuation is
      returned.
    - `FeedPage::continuation_token()` exposes the `ContinuationToken` so callers can
      pause / resume across process boundaries.
    - Item deserialization happens here in the SDK (`T: DeserializeOwned`), not in the
      driver.

24. **Update other call sites.** Anything in the SDK that builds a
    `CosmosOperation::query_items(...)` factory call must be updated for the new field
    layout. This should be confined to `container_client.rs`.

#### 13.1.6 Diagnostics (deferred)

Hierarchical `NodeDiagnostics` (§8) are **deferred to Phase 2 or later**. Phase 1 keeps
the existing flat `RequestDiagnostics` list. Document this in the PR description.

#### 13.1.7 Tests

> **Note:** No round-trip tests. Each serialization test pins an exact expected
> string; each deserialization test feeds an exact input. Tests MAY base64-decode the
> wire form locally so the JSON exemplar in the test is human-readable.
25. **Driver unit tests.**
    - `OperationTarget::partition(...)` populates both the logical PK and the
      singleton `FeedRange` produced by `FeedRange::for_partition_key(...)`.
    - `OperationTarget::feed_range(...)` and `all_ranges()` produce the expected
      variants.
    - `FeedRange::for_partition_key`: for a known PK value + definition, assert
      `min_inclusive == max_exclusive ==` the expected EPK (use a hand-computed
      EPK fixture).
    - `FeedRange::is_singleton`: `true` for any `FeedRange::for_partition_key(...)`
      output; `false` for `FeedRange::full()` and a multi-EPK fixture.
    - `OperationPayload::Query` envelope serializes to the exact string
      `{"query":"SELECT * FROM c","parameters":[]}` (and a parametrized variant with
      the exact expected JSON, asserting field order / casing).
    - `ContinuationToken` serialization: build a token with known fields, assert
      `token.to_string()` equals an exact base64url string. Provide the JSON
      exemplar inside the test body and base64url-encode it locally to derive the
      expected string.
    - `ContinuationToken` deserialization: feed a known base64url input, assert the
      parsed `ContinuationTokenInner` field-by-field.
    - `ContinuationToken` for a `PartitionKey`-target token preserves the original
      logical PK value across decode (assert the PK value equals the input).
    - `ContinuationToken` parse rejects: version > current; bad base64; bad JSON;
      missing required fields. One assertion per failure mode.
    - `Planner` returns `SingleNode` for every point op type (table-driven test).
    - `Planner` returns `SingleNode` for `Query` + `PartitionKey`.
    - `Planner` returns the Phase-2 error for `Query` + `FeedRange` /
      `Query` + `all_ranges()`, asserting the exact error message.
    - `Planner` rejects a continuation token whose stored PK value differs from the
      operation's target PK (`ErrorKind::DataConversion`).
    - `OperationOverrides` → request headers: lock-in test that asserts
      `x-ms-continuation` and `x-ms-max-item-count` appear when the corresponding
      override is set; assert that EPK headers DO NOT appear when the operation's
      target is `OperationTarget::PartitionKey`; assert that EPK headers DO appear
      when the target is `OperationTarget::None` or `OperationTarget::FeedRange(_)`.
    - `ResponseBody::Items` parser: feed a fixture body, assert items extracted as
      raw bytes without re-encoding (assert the `Vec<Vec<u8>>` byte-for-byte against
      expected slices).

26. **Driver integration tests (Cosmos DB Emulator).**
    - All existing point-op tests pass unchanged.
    - New: single-partition query against an emulator container
      (`SELECT * FROM c`, `SELECT * FROM c WHERE c.id = @id`), pagination across
      multiple pages by setting a small `max_item_count`. Tests provision their
      own container, seed deterministic data, and assert exact item sets.
      **Do not introduce any test-proxy recordings** — the existing point-op
      recordings stay as-is, but new feed-operation tests run live against the
      emulator only.

27. **SDK integration tests (Cosmos DB Emulator).**
    - `query_items` with `OperationTarget::partition(pk)` returns the expected
      items.
    - `query_items` with a `WHERE` clause filters server-side.
    - Pagination: drain a multi-page result, then resume from a captured
      continuation token mid-stream and verify the second half matches.
    - A continuation token from a `partition`-target query, when handed back to a
      fresh `query_items` call with the same logical PK, resumes correctly. When
      handed to a different PK, it is rejected.

---

### 13.2 Phase 2 — Sequential Drain & Multi-Node Plans

**End-state for Phase 2.** A user can write:

```rust
let pager = container.query_items::<MyItem>(
    "SELECT * FROM c WHERE c.year = @y",
    [QueryParameter::new("@y", 2026)],
    OperationTarget::all_ranges(), // or feed_range(fr)
    None,
)?;
```

…and the driver plans a `SequentialDrain` over every PK range that intersects the
target, draining them one at a time, paginating across calls, surviving partition
splits and merges via EPK headers, and producing a continuation token after each page.

The Phase-2 PR is purely additive on top of Phase 1: the Phase-1 SingleNode fast-path
remains the path for `OperationTarget::PartitionKey` queries.

#### 13.2.1 Plan model expansion

§4.

1. **Add `NodeId` and `NodeRange`.** `pub(crate)`, `Copy`, with the `len`, `is_empty`,
   and `iter` helpers.

2. **Extend `OperationPlan` with `Graph { nodes: Vec<PlanNode>, root: NodeId }`.**
   Remove the Phase-1 `// TODO(phase-2)` comment.

3. **Add `PlanNode::SequentialDrain { children: NodeRange }`.** Document the bottom-up
   invariant inline.

4. **Update existing pattern matches.** Anywhere that matched on `OperationPlan` /
   `PlanNode` now needs to handle the new variants. The compiler enforces this.
#### 13.2.2 Backend Query Plan request

§5.2.1 / §12.2.

5. **Add `BackendQueryPlan` types.** New module
   `driver/query_plan/{mod.rs,backend_plan.rs}`. **There is no `QueryPlanClient`** —
   the planner issues query-plan requests directly through `execute_single_operation`
   (see step 6).
   - Mirror the schema from the team's existing notes (camelCase JSON):
     `partitionedQueryExecutionInfoVersion`, `queryInfo`, `queryRanges`,
     `hybridSearchQueryInfo`. For Phase 2 we only need: `queryInfo.rewrittenQuery` (must
     be empty / absent for in-scope queries), `queryInfo.hasNonStreamingOrderBy` (must be
     `false`), `queryInfo.aggregates` (must be empty), `queryInfo.groupByExpressions`
     (must be empty), `queryInfo.distinctType` (must be `None`), `queryInfo.orderBy`
     (must be empty), `queryInfo.dCountInfo` (must be absent), `queryInfo.top` (must be
     absent), `queryInfo.offset` / `limit` (must be absent), and `queryRanges`.

6. **Issue the query-plan request inline from the planner.** Add a helper
   `Planner::fetch_backend_plan(operation: &CosmosOperation) -> azure_core::Result<BackendQueryPlan>`
   that:
   - Builds a synthetic `CosmosOperation` whose target is `OperationTarget::None`
     and whose payload is the same `OperationPayload::Query` as the user's request.
   - Calls `execute_single_operation` directly with an `OperationOverrides` that
     sets the query-plan headers:
     `x-ms-cosmos-is-query-plan-request: True`,
     `x-ms-cosmos-supported-query-features: None`,
     `x-ms-cosmos-query-version: 1.0`,
     `Content-Type: application/query+json`,
     `x-ms-documentdb-query-iscontinuationexpected: False`.
     (Extend `OperationOverrides` with a small `extra_headers: Vec<(HeaderName, HeaderValue)>`
     field if no cleaner mechanism exists, or — preferred — add a private
     `RequestKind::QueryPlan` discriminant on the synthetic operation so the
     transport layer applies the right headers without growing the public
     override type. Pick the smaller diff.)
   - Bypasses planning recursion (the planner calls `execute_single_operation`
     directly, not `execute_operation`).
   - Parses the response body into `BackendQueryPlan`.

7. **Classify the plan as passthrough.** (A condensed sketch follows this list.)
   - Helper `BackendQueryPlan::is_passthrough(&self) -> bool` returning `true` iff every
     "must be empty/false/absent" check above passes.
   - If `false`, the planner returns
     `Err(azure_core::Error::with_message(ErrorKind::Other, "this query requires features that are not yet supported by the Rust SDK (cross-partition ORDER BY / GROUP BY / aggregates / vector / hybrid)"))`.
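
A sketch of that check over the `queryInfo` subset from step 5. Field names follow the
camelCase schema above; modeling `distinctType` as an optional string (absent means no
distinct) is an assumption about the wire shape:

```rust
use serde::Deserialize;

/// Illustrative subset of `queryInfo`; defaults make absent fields read
/// as "empty", so a bare `{}` (the plain `SELECT *` case) is passthrough.
#[derive(Deserialize, Default)]
#[serde(rename_all = "camelCase", default)]
struct QueryInfo {
    rewritten_query: Option<String>,
    has_non_streaming_order_by: bool,
    aggregates: Vec<String>,
    group_by_expressions: Vec<String>,
    distinct_type: Option<String>, // assumption: optional string on the wire
    order_by: Vec<String>,
    d_count_info: Option<serde_json::Value>,
    top: Option<u64>,
    offset: Option<u64>,
    limit: Option<u64>,
}

impl QueryInfo {
    /// True iff the query needs no client-side rewriting, sorting,
    /// grouping, aggregation, distinct, or top/offset/limit handling.
    fn is_passthrough(&self) -> bool {
        self.rewritten_query.as_deref().unwrap_or("").is_empty()
            && !self.has_non_streaming_order_by
            && self.aggregates.is_empty()
            && self.group_by_expressions.is_empty()
            && self.distinct_type.is_none()
            && self.order_by.is_empty()
            && self.d_count_info.is_none()
            && self.top.is_none()
            && self.offset.is_none()
            && self.limit.is_none()
    }
}

fn main() {
    let info: QueryInfo = serde_json::from_str("{}").unwrap();
    assert!(info.is_passthrough());

    let info: QueryInfo = serde_json::from_str(r#"{"orderBy":["Ascending"]}"#).unwrap();
    assert!(!info.is_passthrough());
}
```
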
- - Acquire a single concurrency permit (sequential: a `Semaphore::new(1)` per drain; - the permit machinery is §5.6 future work but we add a minimal stub now so that - future variants slot in cleanly — a `tokio::sync::Semaphore` is fine). - - Execute the active child via `execute_request_node`. - - Build the output `ContinuationToken` (step 14). - - Return the `CosmosResponse` with the new token attached. - -13. **Page-boundary rule.** Each `execute_operation` call returns exactly one page from - one partition. Even if the active child completes (no server continuation) and there - are more children, do NOT proactively start the next one in the same call — the - output continuation simply moves the cursor to the next range. §10.5. - -14. **Output continuation construction.** - - If the executed `Request` returned a server continuation: emit - `ResumeState::SequentialDrain { epk_min, epk_max, server_token: Some(...) }` for - the active range. - - If it did not, and there are more children to the right: emit - `ResumeState::SequentialDrain { epk_min, epk_max, server_token: None }` so the - Planner on the next call skips this range and starts the next. - - If it did not, and the active child was the last one: emit `None` continuation - (operation complete). - -#### 13.2.5 Continuation token expansion - -§7. - -15. **Add `ResumeState::SequentialDrain(SequentialDrainState)`.** Add explicit - serialization tests (assert against an exact base64url string) and explicit - deserialization tests (feed an exact input, assert parsed structure - field-by-field). No round-trip tests. - -16. **Version preservation across resume.** Already handled by Phase 1's "echo the input - version" rule — verify it continues to apply when the input is a `SequentialDrain` - token. - -17. **Reject cross-variant tokens.** A `SequentialDrain` token presented to a Phase-1 - SingleNode operation is rejected (`ErrorKind::DataConversion`). A `Request` token - presented to a cross-partition operation is also rejected. - -#### 13.2.6 Split / merge handling - -§9.1 / §9.2. - -18. **Inside `execute_request_node`, on `Status 410 SubStatus 1002`:** - - Invalidate `PartitionKeyRangeCache` for the container. - - Re-fetch the PK ranges intersecting the node's `feed_range`. - - For each new sub-range, issue a follow-up `execute_single_operation` call with - `OperationOverrides.feed_range` set to the **original** node EPK bounds (so EPK - headers narrow the result to the node's slice even on a merged PK range). - - Concatenate the resulting `ResponseBody::Items` into one page-output. - - The continuation token logic (step 14) still applies to the original EPK bounds, - not the new sub-range bounds; the next planner call will see the new topology. - -19. **Pipeline-level helper.** Make sure the transport layer always emits - `x-ms-documentdb-epk-min` / `x-ms-documentdb-epk-max` whenever - `OperationOverrides.feed_range` is set. (Hooked up in Phase 1 step 11.) - -#### 13.2.7 SDK surface adjustments - -20. **Lift Phase-1 restriction.** The planner now accepts `FeedRange` and `all_ranges()` - targets, so the SDK's `query_items` automatically gains cross-partition support — no - code change required at the SDK boundary beyond updating doc comments and adding - examples. - -21. **Pager loop.** The Phase-1 pager loop in `query_items` works unchanged — it just - drains more pages now. - -22. 
**`FeedRange::for_partition_key` interaction.** Confirm that - `FeedRange::for_partition_key(pk, &definition)` (added in Phase 1, step 1) - returns a `FeedRange` that, when handed to `OperationTarget::feed_range(...)`, - drives a single-PK-range `SequentialDrain` and returns the same items as - `OperationTarget::partition(pk, &definition)`. Add an emulator integration - test that asserts the two paths produce the same item set. - -#### 13.2.8 Diagnostics (now or split into a follow-up PR) - -23. Decide based on PR size. If hierarchical `NodeDiagnostics` (§8) fits, add it here: - `start_node` / `complete_node` builder methods, recursive collection in the executor, - flat `requests()` accessor for back-compat. If not, file a follow-up issue and ship - Phase 2 with the existing flat diagnostics — this is purely an observability - enhancement and does not affect correctness. - -#### 13.2.9 Tests - -> Same testing rules as §13.1.7: no round-trip serialization tests; integration tests -> run against the Cosmos DB Emulator, not test-proxy recordings. - -24. **Planner unit tests.** - - Cross-partition `SELECT *` against 3 PK ranges produces a Graph with 3 Request - children + 1 SequentialDrain root, bottom-up. - - Resume from `ResumeState::SequentialDrain` skips left-of-cursor ranges, seeds the - active range's continuation, leaves right-of-cursor ranges unseeded. - - Backend query plan rejection: a query with cross-partition `ORDER BY` is rejected - with the documented error message (assert exact string). - - Single-PK degenerate case still emits `Graph`, not `SingleNode`. - -25. **Executor unit tests (with mock pipeline).** - - SequentialDrain processes one page per call; continuation token threads correctly. - - Active-range completion (no server token, more children) emits a token with - `server_token: None` that on the next call skips to the next range. - - Last range exhausted → `continuation_token` is `None`. - -26. **Continuation token tests.** - - `SequentialDrain` serialization: build a token with known fields, assert - `to_string()` equals an exact base64url string (JSON exemplar provided in the - test, base64url-encoded locally). - - `SequentialDrain` deserialization: feed an exact base64url string, assert the - parsed structure field-by-field. - - Cross-variant rejection (Phase 1 `Request` token vs Phase 2 `SequentialDrain` - operation, both directions). - - Version preservation: input v1 token → all output tokens are v1, even if - `CURRENT_TOKEN_VERSION` has bumped (assert by constructing a v1 input token, - executing one page, asserting the output token's version field equals 1). - -27. **Split / merge tests.** - - Unit-level fault injection: inject a 410/1002 response in the mock pipeline; - verify the Request node re-resolves the PK ranges, issues follow-up calls - with EPK headers narrowing to the original node bounds, and the page completes - without surfacing the error to the caller. - - Verify the EPK headers on the post-split sub-requests by inspecting the - mock pipeline's recorded requests. - - Emulator integration: there is no portable way to force a split on the - emulator, so split/merge coverage is unit-level only. Note this in the PR. - -28. **End-to-end SDK integration tests (Cosmos DB Emulator).** - - `OperationTarget::all_ranges()` with `SELECT * FROM c` against a multi-partition - container drains every item exactly once, in `(EPK, RID)` order. - - Same with a `WHERE` clause: only matching items returned. 
-  - `OperationTarget::feed_range(FeedRange::for_partition_key(pk, &def)?)` returns
-    the same set as `OperationTarget::partition(pk, &def)` for the same logical
-    partition.
-  - Mid-stream pause/resume: capture the continuation after page 2, build a fresh
-    `query_items` call with the same query + the captured token, verify the
-    remaining pages match.
-
----
-
-### 13.3 Out of Scope for Both Phases
-
-- Cross-partition `ORDER BY`, `GROUP BY`, aggregates, vector, hybrid (§12.2).
-- ReadMany (§12.1).
-- Change feed (§12.3).
-- Hierarchical OTEL spans built by the SDK from `NodeDiagnostics` (the SDK owns this,
-  not the driver; spec'd in §8.7 but not on the implementation critical path).
-- Cross-call `OperationPlan` caching (§12.5).
-- Hedging on feed nodes (§12.6).
-- Dedicated `read_all_items` factory (§12.7).

From 3bf13bd018a7cafbaa7026beb4f339eb97ea8584 Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Thu, 7 May 2026 11:49:54 -0700
Subject: [PATCH 13/29] Initial dataflow pipeline

---
 .../src/driver/cosmos_driver.rs             |  72 +++-
 .../src/driver/dataflow/mod.rs              | 180 ++++++++
 .../src/driver/dataflow/request.rs          | 387 ++++++++++++++++++
 .../src/driver/mod.rs                       |   1 +
 .../src/driver/pipeline/retry_evaluation.rs |  66 ++-
 .../src/models/cosmos_headers.rs            |   1 +
 .../src/models/cosmos_status.rs             |  13 +
 7 files changed, 713 insertions(+), 7 deletions(-)
 create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
 create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs

diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 5f74982b439..e646a72bb1d 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -7,10 +7,16 @@ use crate::{
     diagnostics::{
         DiagnosticsContextBuilder, PipelineType, TransportHttpVersion, TransportSecurity,
     },
-    driver::routing::{session_manager::SessionManager, CosmosEndpoint, LocationStateStore},
+    driver::{
+        dataflow::{
+            PartitionRoutingRefresh, Pipeline, PipelineContext, Request, RequestExecutor,
+            RequestTarget,
+        },
+        routing::{session_manager::SessionManager, CosmosEndpoint, LocationStateStore},
+    },
     models::{
-        AccountEndpoint, AccountReference, ActivityId, ContainerProperties, ContainerReference,
-        CosmosOperation, DatabaseProperties, DatabaseReference,
+        request_header_names, AccountEndpoint, AccountReference, ActivityId, ContainerProperties,
+        ContainerReference, CosmosOperation, CosmosResponse, DatabaseProperties, DatabaseReference,
     },
     options::{
         ConnectionPoolOptions, DiagnosticsOptions, DriverOptions, OperationOptions,
@@ -18,6 +24,7 @@ use crate::{
     },
 };
 use arc_swap::ArcSwap;
+use azure_core::http::headers::{HeaderName, HeaderValue};
 use futures::future::BoxFuture;
 use std::error::Error as _;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -34,6 +41,34 @@ use super::{
     CosmosDriverRuntime,
 };
 
+struct DriverRequestExecutor<'a> {
+    driver: &'a CosmosDriver,
+    options: &'a OperationOptions,
+}
+
+impl RequestExecutor for DriverRequestExecutor<'_> {
+    fn execute_request<'a>(
+        &'a mut self,
+        operation: &'a CosmosOperation,
+        _target: &'a RequestTarget,
+        _partition_routing_refresh: PartitionRoutingRefresh,
+        continuation: Option<&'a str>,
+    ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>> {
+        let driver = self.driver;
+        let mut options = self.options.clone();
+        if let Some(continuation) = continuation {
+            let mut custom_headers = options.custom_headers().cloned().unwrap_or_default();
+            custom_headers.insert(
+                HeaderName::from_static(request_header_names::CONTINUATION),
+                HeaderValue::from(continuation.to_owned()),
+            );
+            options = options.with_custom_headers(custom_headers);
+        }
+
+        Box::pin(async move { driver.execute_operation_direct(operation, &options).await })
+    }
+}
+
 /// Cosmos DB driver instance.
 ///
 /// A driver represents a connection to a specific Cosmos DB account. It is created
@@ -972,8 +1007,35 @@ impl CosmosDriver {
         }
         tracing::debug!("operation started");
 
+        let Some(partition_key) = operation.partition_key().cloned() else {
+            return self.execute_operation_direct(&operation, &options).await;
+        };
+
+        let target = RequestTarget::logical_partition_key(partition_key);
+        let root = Request::new(operation, target);
+        let mut pipeline = Pipeline::new(Box::new(root));
+        let mut executor = DriverRequestExecutor {
+            driver: self,
+            options: &options,
+        };
+        let mut context = PipelineContext::new(&mut executor);
+
+        match pipeline.next_page(&mut context).await? {
+            Some(response) => Ok(response),
+            None => Err(azure_core::Error::with_message(
+                azure_core::error::ErrorKind::Other,
+                "request dataflow pipeline completed without emitting a response",
+            )),
+        }
+    }
+
+    async fn execute_operation_direct(
+        &self,
+        operation: &CosmosOperation,
+        options: &OperationOptions,
+    ) -> azure_core::Result<CosmosResponse> {
         // Step 1: Build the single OperationOptionsView for layered resolution.
-        let effective_options = self.operation_options_view(&options);
+        let effective_options = self.operation_options_view(options);
 
         // Step 2: Resolve effective throughput control group (if any).
         let effective_control_group = match operation.container() {
@@ -1048,7 +1110,7 @@ impl CosmosDriver {
 
         // Step 7: Execute via the new operation pipeline
         super::pipeline::operation_pipeline::execute_operation_pipeline(
-            &operation,
+            operation,
             &effective_options,
             options.custom_headers(),
             self.location_state_store.as_ref(),
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
new file mode 100644
index 00000000000..ee4434cd37f
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
@@ -0,0 +1,180 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! Dataflow pipeline nodes for paged Cosmos DB operations.
+
+mod request;
+
+use futures::future::BoxFuture;
+
+use crate::models::{CosmosOperation, CosmosResponse};
+
+pub(crate) use request::{Request, RequestTarget};
+
+/// Request execution mode for partition routing metadata.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum PartitionRoutingRefresh {
+    /// Use existing partition routing metadata.
+    UseCached,
+    /// Force partition routing metadata to be refreshed before executing.
+    ForceRefresh,
+}
+
+/// Executes leaf request nodes through the existing operation pipeline.
+pub(crate) trait RequestExecutor: Send {
+    /// Executes a single request node.
+    fn execute_request<'a>(
+        &'a mut self,
+        operation: &'a CosmosOperation,
+        target: &'a RequestTarget,
+        partition_routing_refresh: PartitionRoutingRefresh,
+        continuation: Option<&'a str>,
+    ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>>;
+}
+
+/// Context passed through dataflow node execution.
+pub(crate) struct PipelineContext<'a> {
+    request_executor: &'a mut dyn RequestExecutor,
+}
+
+impl<'a> PipelineContext<'a> {
+    /// Creates a new pipeline execution context.
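+    ///
+    /// The context borrows the executor rather than owning it, so node trees
+    /// carry no transport state of their own.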
+    pub(crate) fn new(request_executor: &'a mut dyn RequestExecutor) -> Self {
+        Self { request_executor }
+    }
+
+    async fn execute_request(
+        &mut self,
+        operation: &CosmosOperation,
+        target: &RequestTarget,
+        partition_routing_refresh: PartitionRoutingRefresh,
+        continuation: Option<&str>,
+    ) -> azure_core::Result<CosmosResponse> {
+        self.request_executor
+            .execute_request(operation, target, partition_routing_refresh, continuation)
+            .await
+    }
+}
+
+/// A dataflow node that emits pages and may own child nodes.
+///
+/// Each `next_page` call boxes a future via `async_trait`; the per-page
+/// allocation is negligible compared to the multi-millisecond network I/O
+/// of a Cosmos DB request.
+#[async_trait::async_trait]
+pub(crate) trait PipelineNode: Send {
+    /// Emits the next page of results, or `None` when this node is drained.
+    async fn next_page(
+        &mut self,
+        context: &mut PipelineContext<'_>,
+    ) -> azure_core::Result<Option<CosmosResponse>>;
+
+    /// Returns the node's strongly-owned children.
+    fn children(&self) -> &[Box<dyn PipelineNode>];
+}
+
+/// A pipeline root that owns the node tree.
+pub(crate) struct Pipeline {
+    root: Box<dyn PipelineNode>,
+}
+
+impl Pipeline {
+    /// Creates a pipeline from an owned root node.
+    pub(crate) fn new(root: Box<dyn PipelineNode>) -> Self {
+        Self { root }
+    }
+
+    /// Emits the next page from the root node.
+    pub(crate) async fn next_page(
+        &mut self,
+        context: &mut PipelineContext<'_>,
+    ) -> azure_core::Result<Option<CosmosResponse>> {
+        self.root.next_page(context).await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::VecDeque, sync::Arc};
+
+    use futures::future::BoxFuture;
+
+    use super::*;
+    use crate::{
+        diagnostics::DiagnosticsContextBuilder,
+        models::{ActivityId, CosmosResponseHeaders, CosmosStatus},
+        options::DiagnosticsOptions,
+    };
+
+    struct MockLeaf {
+        pages: VecDeque<azure_core::Result<Option<CosmosResponse>>>,
+    }
+
+    impl MockLeaf {
+        fn with_pages(pages: Vec<azure_core::Result<Option<CosmosResponse>>>) -> Self {
+            Self {
+                pages: pages.into(),
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl PipelineNode for MockLeaf {
+        async fn next_page(
+            &mut self,
+            _context: &mut PipelineContext<'_>,
+        ) -> azure_core::Result<Option<CosmosResponse>> {
+            self.pages.pop_front().expect("mock page result")
+        }
+
+        fn children(&self) -> &[Box<dyn PipelineNode>] {
+            &[]
+        }
+    }
+
+    struct NoopRequestExecutor;
+
+    impl RequestExecutor for NoopRequestExecutor {
+        fn execute_request<'a>(
+            &'a mut self,
+            _operation: &'a CosmosOperation,
+            _target: &'a RequestTarget,
+            _partition_routing_refresh: PartitionRoutingRefresh,
+            _continuation: Option<&'a str>,
+        ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>> {
+            Box::pin(async {
+                Err(azure_core::Error::with_message(
+                    azure_core::error::ErrorKind::Other,
+                    "noop executor should not be called",
+                ))
+            })
+        }
+    }
+
+    fn response(body: &[u8]) -> CosmosResponse {
+        let mut diagnostics = DiagnosticsContextBuilder::new(
+            ActivityId::new_uuid(),
+            Arc::new(DiagnosticsOptions::default()),
+        );
+        diagnostics.set_operation_status(azure_core::http::StatusCode::Ok, None);
+        CosmosResponse::new(
+            body.to_vec(),
+            CosmosResponseHeaders::new(),
+            CosmosStatus::new(azure_core::http::StatusCode::Ok),
+            Arc::new(diagnostics.complete()),
+        )
+    }
+
+    #[tokio::test]
+    async fn pipeline_forwards_pages_from_root() {
+        let mut pipeline = Pipeline::new(Box::new(MockLeaf::with_pages(vec![Ok(Some(response(
+            b"page",
+        )))]))); 
+        let mut executor = NoopRequestExecutor;
+        let mut context = PipelineContext::new(&mut executor);
+
+        let page = pipeline.next_page(&mut context).await.unwrap().unwrap();
+
+        assert_eq!(page.body(), b"page");
+    }
+}
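Between the pipeline root above and the `Request` leaf below, composite nodes are the
extension point. A sketch, not part of this patch, of how a fan-out node could
implement the trait, draining children in order and yielding one page per call:

```rust
// Hypothetical composite node: assumes the PipelineNode / PipelineContext
// definitions from dataflow/mod.rs above. This is the shape a sequential
// cross-partition drain would take.
struct SequentialMerge {
    children: Vec<Box<dyn PipelineNode>>,
    active: usize,
}

#[async_trait::async_trait]
impl PipelineNode for SequentialMerge {
    async fn next_page(
        &mut self,
        context: &mut PipelineContext<'_>,
    ) -> azure_core::Result<Option<CosmosResponse>> {
        while let Some(child) = self.children.get_mut(self.active) {
            if let Some(page) = child.next_page(context).await? {
                return Ok(Some(page));
            }
            // Active child is drained; advance the cursor to the next one.
            self.active += 1;
        }
        // All children drained: this node is drained too.
        Ok(None)
    }

    fn children(&self) -> &[Box<dyn PipelineNode>] {
        &self.children
    }
}
```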
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
new file mode 100644
index 00000000000..55e8bd216fd
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
@@ -0,0 +1,387 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! Request leaf node for the dataflow pipeline.
+
+use std::ops::Range;
+
+use async_trait::async_trait;
+use azure_core::http::StatusCode;
+
+use crate::models::{
+    effective_partition_key::EffectivePartitionKey, CosmosOperation, CosmosResponse, PartitionKey,
+    SubStatusCode,
+};
+
+use super::{PartitionRoutingRefresh, PipelineContext, PipelineNode};
+
+/// The target of a request node.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) enum RequestTarget {
+    /// A single logical partition key.
+    LogicalPartitionKey(PartitionKey),
+
+    /// An effective partition key range believed to be in one physical partition.
+    EffectivePartitionKeyRange {
+        /// EPK range scoped by this request.
+        range: Range<EffectivePartitionKey>,
+        /// Partition key range ID believed to contain `range`.
+        partition_key_range_id: String,
+    },
+}
+
+impl RequestTarget {
+    /// Creates a logical partition key target.
+    pub(crate) fn logical_partition_key(partition_key: PartitionKey) -> Self {
+        Self::LogicalPartitionKey(partition_key)
+    }
+
+    /// Creates an EPK range target believed to be contained by one physical partition.
+    pub(crate) fn effective_partition_key_range(
+        range: Range<EffectivePartitionKey>,
+        partition_key_range_id: impl Into<String>,
+    ) -> Self {
+        Self::EffectivePartitionKeyRange {
+            range,
+            partition_key_range_id: partition_key_range_id.into(),
+        }
+    }
+}
+
+/// Leaf node that executes one Cosmos DB request per page.
+pub(crate) struct Request {
+    operation: CosmosOperation,
+    target: RequestTarget,
+    latest_server_continuation: Option<String>,
+    logical_partition_topology_retry_used: bool,
+}
+
+impl Request {
+    /// Creates a request node.
+    pub(crate) fn new(operation: CosmosOperation, target: RequestTarget) -> Self {
+        Self::with_continuation(operation, target, None)
+    }
+
+    /// Creates a request node restored with the latest server-issued continuation.
+    pub(crate) fn with_continuation(
+        operation: CosmosOperation,
+        target: RequestTarget,
+        latest_server_continuation: Option<String>,
+    ) -> Self {
+        Self {
+            operation,
+            target,
+            latest_server_continuation,
+            logical_partition_topology_retry_used: false,
+        }
+    }
+
+    /// Returns the operation this request node executes.
+    pub(crate) fn operation(&self) -> &CosmosOperation {
+        &self.operation
+    }
+
+    /// Returns the target this request node uses for routing.
+    pub(crate) fn target(&self) -> &RequestTarget {
+        &self.target
+    }
+
+    /// Returns the latest server-issued continuation for this request's partition.
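+    ///
+    /// `None` means either that no page has been fetched yet or that the
+    /// partition is fully drained. Callers can persist this value and restore
+    /// it later through [`Request::with_continuation`].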
+    pub(crate) fn latest_server_continuation(&self) -> Option<&str> {
+        self.latest_server_continuation.as_deref()
+    }
+}
+
+#[async_trait]
+impl PipelineNode for Request {
+    async fn next_page(
+        &mut self,
+        context: &mut PipelineContext<'_>,
+    ) -> azure_core::Result<Option<CosmosResponse>> {
+        match context
+            .execute_request(
+                &self.operation,
+                &self.target,
+                PartitionRoutingRefresh::UseCached,
+                self.latest_server_continuation.as_deref(),
+            )
+            .await
+        {
+            Ok(response) => Ok(Some(self.record_response_continuation(response))),
+            Err(error) if is_partition_topology_change(&error) => {
+                self.handle_partition_topology_change(context, error).await
+            }
+            Err(error) => Err(error),
+        }
+    }
+
+    fn children(&self) -> &[Box<dyn PipelineNode>] {
+        &[]
+    }
+}
+
+impl Request {
+    async fn handle_partition_topology_change(
+        &mut self,
+        context: &mut PipelineContext<'_>,
+        error: azure_core::Error,
+    ) -> azure_core::Result<Option<CosmosResponse>> {
+        match &self.target {
+            RequestTarget::LogicalPartitionKey(_) => {
+                if self.logical_partition_topology_retry_used {
+                    return Err(error);
+                }
+
+                self.logical_partition_topology_retry_used = true;
+                context
+                    .execute_request(
+                        &self.operation,
+                        &self.target,
+                        PartitionRoutingRefresh::ForceRefresh,
+                        self.latest_server_continuation.as_deref(),
+                    )
+                    .await
+                    .map(|response| self.record_response_continuation(response))
+                    .map(Some)
+            }
+            RequestTarget::EffectivePartitionKeyRange { .. } => {
+                panic!(
+                    "EPK range request encountered a partition topology change; pipeline repair is not implemented"
+                );
+            }
+        }
+    }
+
+    fn record_response_continuation(&mut self, response: CosmosResponse) -> CosmosResponse {
+        self.latest_server_continuation = response.headers().continuation.clone();
+        response
+    }
+}
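+
+/// Classifies an error as a partition topology change: an HTTP 410 (Gone)
+/// whose Cosmos substatus reports the partition key range gone, a split
+/// completing, or a partition migration completing. These are repaired by the
+/// request node instead of being retried by the operation pipeline.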
+fn is_partition_topology_change(error: &azure_core::Error) -> bool {
+    match error.kind() {
+        azure_core::error::ErrorKind::HttpResponse {
+            status, error_code, ..
+        } if *status == StatusCode::Gone => error_code
+            .as_deref()
+            .and_then(|code| code.parse::<u32>().ok())
+            .is_some_and(is_partition_topology_change_substatus),
+        _ => false,
+    }
+}
+
+fn is_partition_topology_change_substatus(substatus: u32) -> bool {
+    matches!(
+        SubStatusCode::new(substatus),
+        SubStatusCode::PARTITION_KEY_RANGE_GONE
+            | SubStatusCode::COMPLETING_SPLIT
+            | SubStatusCode::COMPLETING_PARTITION_MIGRATION
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::VecDeque, sync::Arc};
+
+    use azure_core::error::ErrorKind;
+    use futures::future::BoxFuture;
+
+    use super::*;
+    use crate::{
+        diagnostics::DiagnosticsContextBuilder,
+        driver::dataflow::RequestExecutor,
+        models::{
+            AccountReference, ActivityId, CosmosResponseHeaders, CosmosStatus, DatabaseReference,
+        },
+        options::DiagnosticsOptions,
+    };
+
+    struct MockRequestExecutor {
+        responses: VecDeque<azure_core::Result<CosmosResponse>>,
+        refresh_calls: Vec<PartitionRoutingRefresh>,
+        continuation_calls: Vec<Option<String>>,
+    }
+
+    impl MockRequestExecutor {
+        fn new(responses: Vec<azure_core::Result<CosmosResponse>>) -> Self {
+            Self {
+                responses: responses.into(),
+                refresh_calls: Vec::new(),
+                continuation_calls: Vec::new(),
+            }
+        }
+    }
+
+    impl RequestExecutor for MockRequestExecutor {
+        fn execute_request<'a>(
+            &'a mut self,
+            _operation: &'a CosmosOperation,
+            _target: &'a RequestTarget,
+            partition_routing_refresh: PartitionRoutingRefresh,
+            continuation: Option<&'a str>,
+        ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>> {
+            self.refresh_calls.push(partition_routing_refresh);
+            self.continuation_calls
+                .push(continuation.map(str::to_owned));
+            let response = self.responses.pop_front().expect("mock request response");
+            Box::pin(async move { response })
+        }
+    }
+
+    fn operation() -> CosmosOperation {
+        let account = AccountReference::with_master_key(
+            url::Url::parse("https://test.documents.azure.com:443/").unwrap(),
+            "dGVzdA==",
+        );
+        let database = DatabaseReference::from_name(account, "db".to_owned());
+        CosmosOperation::read_database(database)
+    }
+
+    fn logical_partition_target() -> RequestTarget {
+        RequestTarget::logical_partition_key(PartitionKey::from("pk"))
+    }
+
+    fn epk_range_target() -> RequestTarget {
+        RequestTarget::effective_partition_key_range(
+            EffectivePartitionKey::from("00")..EffectivePartitionKey::from("80"),
+            "0",
+        )
+    }
+
+    fn response(body: &[u8]) -> CosmosResponse {
+        response_with_continuation(body, None)
+    }
+
+    fn response_with_continuation(body: &[u8], continuation: Option<&str>) -> CosmosResponse {
+        let mut diagnostics = DiagnosticsContextBuilder::new(
+            ActivityId::new_uuid(),
+            Arc::new(DiagnosticsOptions::default()),
+        );
+        diagnostics.set_operation_status(StatusCode::Ok, None);
+        let mut headers = CosmosResponseHeaders::new();
+        headers.continuation = continuation.map(str::to_owned);
+        CosmosResponse::new(
+            body.to_vec(),
+            headers,
+            CosmosStatus::new(StatusCode::Ok),
+            Arc::new(diagnostics.complete()),
+        )
+    }
+
+    fn gone_error() -> azure_core::Error {
+        azure_core::Error::new(
+            ErrorKind::HttpResponse {
+                status: StatusCode::Gone,
+                error_code: Some(SubStatusCode::PARTITION_KEY_RANGE_GONE.value().to_string()),
+                raw_response: None,
+            },
+            "partition topology changed",
+        )
+    }
+
+    fn non_topology_gone_error() -> azure_core::Error {
+        azure_core::Error::new(
+            ErrorKind::HttpResponse {
+                status: StatusCode::Gone,
+                error_code: Some(SubStatusCode::NAME_CACHE_STALE.value().to_string()),
+                raw_response: None,
+            },
+            "name cache is stale",
+        )
+    }
+
+    #[tokio::test]
+    async fn request_retries_logical_partition_key_topology_change_once() {
+        let mut request =
Request::new(operation(), logical_partition_target()); + let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Ok(response(b"ok"))]); + let mut context = PipelineContext::new(&mut executor); + + let page = request.next_page(&mut context).await.unwrap().unwrap(); + + assert_eq!(page.body(), b"ok"); + assert_eq!( + executor.refresh_calls, + vec![ + PartitionRoutingRefresh::UseCached, + PartitionRoutingRefresh::ForceRefresh + ] + ); + assert_eq!(executor.continuation_calls, vec![None, None]); + } + + #[tokio::test] + async fn request_returns_second_logical_partition_key_topology_change() { + let mut request = Request::new(operation(), logical_partition_target()); + let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Err(gone_error())]); + let mut context = PipelineContext::new(&mut executor); + + let error = request.next_page(&mut context).await.unwrap_err(); + + assert!(is_partition_topology_change(&error)); + assert_eq!( + executor.refresh_calls, + vec![ + PartitionRoutingRefresh::UseCached, + PartitionRoutingRefresh::ForceRefresh + ] + ); + assert_eq!(executor.continuation_calls, vec![None, None]); + } + + #[tokio::test] + async fn request_does_not_retry_non_topology_gone() { + let mut request = Request::new(operation(), logical_partition_target()); + let mut executor = MockRequestExecutor::new(vec![Err(non_topology_gone_error())]); + let mut context = PipelineContext::new(&mut executor); + + let error = request.next_page(&mut context).await.unwrap_err(); + + assert!(!is_partition_topology_change(&error)); + assert_eq!( + executor.refresh_calls, + vec![PartitionRoutingRefresh::UseCached] + ); + assert_eq!(executor.continuation_calls, vec![None]); + } + + #[tokio::test] + async fn request_tracks_server_continuation_for_next_page() { + let mut request = Request::new(operation(), logical_partition_target()); + let mut executor = MockRequestExecutor::new(vec![ + Ok(response_with_continuation(b"page1", Some("token-1"))), + Ok(response_with_continuation(b"page2", Some("token-2"))), + ]); + let mut context = PipelineContext::new(&mut executor); + + let page1 = request.next_page(&mut context).await.unwrap().unwrap(); + let page2 = request.next_page(&mut context).await.unwrap().unwrap(); + + assert_eq!(page1.body(), b"page1"); + assert_eq!(page2.body(), b"page2"); + assert_eq!( + executor.continuation_calls, + vec![None, Some("token-1".to_string())] + ); + assert_eq!(request.latest_server_continuation(), Some("token-2")); + } + + #[tokio::test] + async fn request_uses_restored_continuation_on_first_page() { + let mut request = Request::with_continuation( + operation(), + logical_partition_target(), + Some("restored-token".to_string()), + ); + let mut executor = MockRequestExecutor::new(vec![Ok(response(b"page"))]); + let mut context = PipelineContext::new(&mut executor); + + let page = request.next_page(&mut context).await.unwrap().unwrap(); + + assert_eq!(page.body(), b"page"); + assert_eq!( + executor.continuation_calls, + vec![Some("restored-token".to_string())] + ); + assert_eq!(request.latest_server_continuation(), None); + } +} diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs index 2e7bdf123ab..22fab1801f0 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs @@ -13,6 +13,7 @@ pub(crate) mod cache; mod cosmos_driver; +pub(crate) mod dataflow; pub(crate) mod jitter; pub(crate) mod pipeline; pub(crate) mod routing; diff 
--git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/retry_evaluation.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/retry_evaluation.rs index 8184d0068cb..584a692bcec 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/retry_evaluation.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/retry_evaluation.rs @@ -88,10 +88,11 @@ pub(crate) fn evaluate_transport_result( ); } - // 429/3092 (SystemResourceUnavailable) and 410 (Gone). + // 429/3092 (SystemResourceUnavailable) and non-topology 410 (Gone). + // Partition-topology 410s are handled by the dataflow request node. let is_system_resource_unavailable = status.is_throttled() && status.sub_status() == Some(SubStatusCode::SYSTEM_RESOURCE_UNAVAILABLE); - let is_gone = status.is_gone(); + let is_gone = status.is_gone() && !status.is_partition_topology_change(); if (is_system_resource_unavailable || is_gone) && retry_state.can_retry_failover() { if request_definitely_not_sent { @@ -360,6 +361,18 @@ mod tests { } } + fn make_http_error_status(status: CosmosStatus) -> TransportResult { + TransportResult { + outcome: TransportOutcome::HttpError { + status, + headers: azure_core::http::headers::Headers::new(), + cosmos_headers: CosmosResponseHeaders::default(), + body: vec![], + request_sent: RequestSentStatus::Sent, + }, + } + } + #[test] fn success_completes() { let op = make_read_operation(); @@ -495,6 +508,55 @@ mod tests { assert!(matches!(action, OperationAction::Abort { .. })); } + #[test] + fn partition_topology_gone_aborts_for_dataflow_handling() { + let op = make_read_operation(); + let result = make_http_error_status( + CosmosStatus::new(StatusCode::Gone) + .with_sub_status(SubStatusCode::PARTITION_KEY_RANGE_GONE.value()), + ); + let state = OperationRetryState::initial(0, false, Vec::new(), 3, 1); + let endpoint = CosmosEndpoint::global( + url::Url::parse("https://test.documents.azure.com:443/").unwrap(), + ); + + let (action, effects) = evaluate_transport_result(&op, &endpoint, result, &state); + + match action { + OperationAction::Abort { status, .. } => { + assert_eq!( + status, + Some( + CosmosStatus::new(StatusCode::Gone) + .with_sub_status(SubStatusCode::PARTITION_KEY_RANGE_GONE.value()) + ) + ); + } + other => panic!("expected abort, got {other:?}"), + } + assert!(effects.is_empty()); + } + + #[test] + fn non_topology_gone_still_retries() { + let op = make_read_operation(); + let result = make_http_error_status( + CosmosStatus::new(StatusCode::Gone) + .with_sub_status(SubStatusCode::NAME_CACHE_STALE.value()), + ); + let state = OperationRetryState::initial(0, false, Vec::new(), 3, 1); + let endpoint = CosmosEndpoint::global( + url::Url::parse("https://test.documents.azure.com:443/").unwrap(), + ); + + let (action, effects) = evaluate_transport_result(&op, &endpoint, result, &state); + + assert!(matches!(action, OperationAction::FailoverRetry { .. })); + assert!(effects + .iter() + .any(|e| matches!(e, LocationEffect::MarkEndpointUnavailable { .. 
}))); + } + #[test] fn write_forbidden_triggers_failover_and_refresh_effect() { let op = make_create_operation(); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs index 4e666d3e9c7..b8d8c36eb4d 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs @@ -24,6 +24,7 @@ pub(crate) mod request_header_names { pub const IS_BATCH_REQUEST: &str = "x-ms-cosmos-is-batch-request"; pub const BATCH_ATOMIC: &str = "x-ms-cosmos-batch-atomic"; pub const BATCH_CONTINUE_ON_ERROR: &str = "x-ms-cosmos-batch-continue-on-error"; + pub const CONTINUATION: &str = "x-ms-continuation"; pub const OFFER_THROUGHPUT: &str = "x-ms-offer-throughput"; pub const OFFER_AUTOPILOT_SETTINGS: &str = "x-ms-cosmos-offer-autopilot-settings"; pub const PRIORITY_LEVEL: &str = "x-ms-cosmos-priority-level"; diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_status.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_status.rs index 1b11373585d..f911afd417b 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_status.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_status.rs @@ -1288,6 +1288,19 @@ impl CosmosStatus { && self.sub_status == Some(SubStatusCode::PARTITION_KEY_RANGE_GONE) } + /// Returns `true` if this is an HTTP 410 caused by partition topology changing. + pub(crate) fn is_partition_topology_change(&self) -> bool { + u16::from(self.status_code) == 410 + && matches!( + self.sub_status, + Some( + SubStatusCode::PARTITION_KEY_RANGE_GONE + | SubStatusCode::COMPLETING_SPLIT + | SubStatusCode::COMPLETING_PARTITION_MIGRATION + ) + ) + } + /// Returns `true` if this indicates a transport-generated 503 (client-side). pub fn is_transport_generated_503(&self) -> bool { u16::from(self.status_code) == 503 From 3c3f8007e42abcd3a980137ed4223b7534af3d5a Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Thu, 7 May 2026 12:38:45 -0700 Subject: [PATCH 14/29] Add OperationTarget and driver FeedRange Introduce OperationTarget enum (None, PartitionKey, FeedRange) to replace the partition_key field on CosmosOperation. This models the three mutually-exclusive partition targeting modes at the type level: non-partitioned, single logical partition, and feed range. Create a driver-internal FeedRange type with EPK bounds for pipeline routing. Add SDK<->driver FeedRange conversion methods. Require OperationTarget in CosmosOperation::new() so all factory methods must explicitly specify their target. Update execute_operation to route based on target variant. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../azure_data_cosmos/src/feed_range.rs | 32 ++++ .../src/driver/cosmos_driver.rs | 20 ++- .../src/models/cosmos_operation.rs | 144 ++++++++++++------ .../src/models/feed_range.rs | 137 +++++++++++++++++ .../src/models/mod.rs | 4 + .../src/models/operation_target.rs | 46 ++++++ 6 files changed, 330 insertions(+), 53 deletions(-) create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs diff --git a/sdk/cosmos/azure_data_cosmos/src/feed_range.rs b/sdk/cosmos/azure_data_cosmos/src/feed_range.rs index cde34347c66..848685611dd 100644 --- a/sdk/cosmos/azure_data_cosmos/src/feed_range.rs +++ b/sdk/cosmos/azure_data_cosmos/src/feed_range.rs @@ -240,6 +240,38 @@ impl FeedRange { max_exclusive: max, }) } + /// Converts this SDK `FeedRange` into the driver's `FeedRange` type. + /// + /// The driver's `FeedRange` is used internally for pipeline routing and + /// does not carry serialization logic. + #[allow( + dead_code, + reason = "will be used when query/change-feed operations target feed ranges" + )] + pub(crate) fn to_driver_feed_range(&self) -> azure_data_cosmos_driver::models::FeedRange { + azure_data_cosmos_driver::models::FeedRange::new( + azure_data_cosmos_driver::models::effective_partition_key::EffectivePartitionKey::from( + self.min_inclusive.as_str(), + ), + azure_data_cosmos_driver::models::effective_partition_key::EffectivePartitionKey::from( + self.max_exclusive.as_str(), + ), + ) + } + + /// Creates an SDK `FeedRange` from the driver's `FeedRange` type. + #[allow( + dead_code, + reason = "will be used when query/change-feed operations target feed ranges" + )] + pub(crate) fn from_driver_feed_range( + driver_range: &azure_data_cosmos_driver::models::FeedRange, + ) -> Self { + Self { + min_inclusive: EffectivePartitionKey::from(driver_range.min_inclusive().as_str()), + max_exclusive: EffectivePartitionKey::from(driver_range.max_exclusive().as_str()), + } + } } impl fmt::Display for FeedRange { diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs index e646a72bb1d..735e2da770a 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs @@ -1007,10 +1007,24 @@ impl CosmosDriver { } tracing::debug!("operation started"); - let Some(partition_key) = operation.partition_key().cloned() else { - return self.execute_operation_direct(&operation, &options).await; - }; + match operation.target() { + crate::models::OperationTarget::None => { + return self.execute_operation_direct(&operation, &options).await; + } + crate::models::OperationTarget::FeedRange(_) => { + return Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "FeedRange targeting is not yet implemented for execute_operation; \ + use the dataflow pipeline directly for feed range operations", + )); + } + crate::models::OperationTarget::PartitionKey(_) => {} + } + let partition_key = operation + .partition_key() + .expect("PartitionKey target matched above but partition_key() returned None") + .clone(); let target = RequestTarget::logical_partition_key(partition_key); let root = Request::new(operation, target); let mut pipeline = Pipeline::new(Box::new(root)); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs 
b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
index b120762d34a..81029812f0e 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
@@ -5,7 +5,8 @@
 
 use crate::models::{
     AccountReference, ContainerReference, CosmosRequestHeaders, CosmosResourceReference,
-    DatabaseReference, ItemReference, OperationType, PartitionKey, Precondition, ResourceType,
+    DatabaseReference, ItemReference, OperationTarget, OperationType, PartitionKey, Precondition,
+    ResourceType,
 };
 use std::borrow::Cow;
@@ -62,8 +63,8 @@ pub struct CosmosOperation {
     resource_type: ResourceType,
     /// Reference to the resource being operated on.
     resource_reference: CosmosResourceReference,
-    /// Optional partition key for data plane operations.
-    partition_key: Option<PartitionKey>,
+    /// Describes how the operation targets the partition key space.
+    target: OperationTarget,
     /// Additional request headers to include in the request.
     request_headers: CosmosRequestHeaders,
     /// Optional request body (raw bytes, schema-agnostic).
@@ -111,9 +112,14 @@ impl CosmosOperation {
         self.resource_reference.container()
     }
 
-    /// Returns the partition key, if set.
+    /// Returns the operation target.
+    pub fn target(&self) -> &OperationTarget {
+        &self.target
+    }
+
+    /// Returns the partition key if this operation targets a single logical partition.
     pub fn partition_key(&self) -> Option<&PartitionKey> {
-        self.partition_key.as_ref()
+        self.target.partition_key()
     }
 
     /// Returns the request headers.
@@ -126,12 +132,6 @@ impl CosmosOperation {
         self.body.as_deref()
     }
 
-    /// Sets the partition key for the operation.
-    pub fn with_partition_key(mut self, partition_key: impl Into<PartitionKey>) -> Self {
-        self.partition_key = Some(partition_key.into());
-        self
-    }
-
     /// Sets request headers for the operation.
     pub fn with_request_headers(mut self, headers: CosmosRequestHeaders) -> Self {
         self.request_headers = headers;
@@ -172,10 +172,11 @@ impl CosmosOperation {
 
     // ===== Factory Methods =====
 
-    /// Creates a new operation with the specified type and resource reference.
+    /// Creates a new operation with the specified type, resource reference, and target.
     fn new(
         operation_type: OperationType,
         resource_reference: impl Into<CosmosResourceReference>,
+        target: OperationTarget,
     ) -> Self {
         let resource_reference = resource_reference.into();
         let resource_type = resource_reference.resource_type();
@@ -183,7 +184,7 @@ impl CosmosOperation {
             operation_type,
             resource_type,
             resource_reference,
-            partition_key: None,
+            target,
             request_headers: CosmosRequestHeaders::new(),
             body: None,
         }
@@ -216,7 +217,7 @@ impl CosmosOperation {
         let resource_ref: CosmosResourceReference = CosmosResourceReference::from(account)
             .with_resource_type(ResourceType::Database)
             .into_feed_reference();
-        Self::new(OperationType::Create, resource_ref)
+        Self::new(OperationType::Create, resource_ref, OperationTarget::None)
     }
 
     /// Reads (lists) all databases in the account.
@@ -226,7 +227,7 @@ impl CosmosOperation {
         let resource_ref = Into::<CosmosResourceReference>::into(account)
             .with_resource_type(ResourceType::Database)
             .into_feed_reference();
-        Self::new(OperationType::ReadFeed, resource_ref)
+        Self::new(OperationType::ReadFeed, resource_ref, OperationTarget::None)
     }
 
     /// Queries databases in the account.
@@ -236,7 +237,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(account) .with_resource_type(ResourceType::Database) .into_feed_reference(); - Self::new(OperationType::Query, resource_ref) + Self::new(OperationType::Query, resource_ref, OperationTarget::None) } /// Deletes a database. @@ -259,7 +260,7 @@ impl CosmosOperation { /// ``` pub fn delete_database(database: DatabaseReference) -> Self { let resource_ref: CosmosResourceReference = database.into(); - Self::new(OperationType::Delete, resource_ref) + Self::new(OperationType::Delete, resource_ref, OperationTarget::None) } /// Reads a database's properties from the service. @@ -268,7 +269,7 @@ impl CosmosOperation { /// the system-managed `_rid`, `_ts`, and `_etag`. pub fn read_database(database: DatabaseReference) -> Self { let resource_ref: CosmosResourceReference = database.into(); - Self::new(OperationType::Read, resource_ref) + Self::new(OperationType::Read, resource_ref, OperationTarget::None) } /// Creates a container in a database. @@ -299,7 +300,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(database) .with_resource_type(ResourceType::DocumentCollection) .into_feed_reference(); - Self::new(OperationType::Create, resource_ref) + Self::new(OperationType::Create, resource_ref, OperationTarget::None) } /// Reads (lists) all containers in a database. @@ -309,7 +310,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(database) .with_resource_type(ResourceType::DocumentCollection) .into_feed_reference(); - Self::new(OperationType::ReadFeed, resource_ref) + Self::new(OperationType::ReadFeed, resource_ref, OperationTarget::None) } /// Queries containers in a database. @@ -319,7 +320,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(database) .with_resource_type(ResourceType::DocumentCollection) .into_feed_reference(); - Self::new(OperationType::Query, resource_ref) + Self::new(OperationType::Query, resource_ref, OperationTarget::None) } /// Deletes a container. @@ -354,7 +355,7 @@ impl CosmosOperation { /// ``` pub fn delete_container(container: ContainerReference) -> Self { let resource_ref: CosmosResourceReference = container.into(); - Self::new(OperationType::Delete, resource_ref) + Self::new(OperationType::Delete, resource_ref, OperationTarget::None) } /// Replaces a container's properties. @@ -362,7 +363,7 @@ impl CosmosOperation { /// Use `with_body()` to provide the updated container properties JSON. pub fn replace_container(container: ContainerReference) -> Self { let resource_ref: CosmosResourceReference = container.into(); - Self::new(OperationType::Replace, resource_ref) + Self::new(OperationType::Replace, resource_ref, OperationTarget::None) } /// Reads a container's properties from the service. @@ -371,7 +372,7 @@ impl CosmosOperation { /// including system-managed properties like `_rid`, `_ts`, and `_etag`. pub fn read_container(container: ContainerReference) -> Self { let resource_ref: CosmosResourceReference = container.into(); - Self::new(OperationType::Read, resource_ref) + Self::new(OperationType::Read, resource_ref, OperationTarget::None) } /// Reads a container's properties by database and container name. 
@@ -386,7 +387,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(database) .with_resource_type(ResourceType::DocumentCollection) .with_name(container_name.into()); - Self::new(OperationType::Read, resource_ref) + Self::new(OperationType::Read, resource_ref, OperationTarget::None) } /// Reads a container's properties by database RID and container RID. @@ -397,7 +398,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(database) .with_resource_type(ResourceType::DocumentCollection) .with_rid(container_rid.into()); - Self::new(OperationType::Read, resource_ref) + Self::new(OperationType::Read, resource_ref, OperationTarget::None) } // ===== Data Plane Factory Methods ===== @@ -440,7 +441,11 @@ impl CosmosOperation { /// ``` pub fn create_item(item: ItemReference) -> Self { let partition_key = item.partition_key().clone(); - Self::new(OperationType::Create, item).with_partition_key(partition_key) + Self::new( + OperationType::Create, + item, + OperationTarget::PartitionKey(partition_key), + ) } /// Reads an item (document) from a container. @@ -477,7 +482,11 @@ impl CosmosOperation { /// ``` pub fn read_item(item: ItemReference) -> Self { let partition_key = item.partition_key().clone(); - Self::new(OperationType::Read, item).with_partition_key(partition_key) + Self::new( + OperationType::Read, + item, + OperationTarget::PartitionKey(partition_key), + ) } /// Deletes an item (document) from a container. @@ -486,7 +495,11 @@ impl CosmosOperation { /// providing all the information needed for the operation. pub fn delete_item(item: ItemReference) -> Self { let partition_key = item.partition_key().clone(); - Self::new(OperationType::Delete, item).with_partition_key(partition_key) + Self::new( + OperationType::Delete, + item, + OperationTarget::PartitionKey(partition_key), + ) } /// Executes a transactional batch of operations against a single partition. @@ -498,7 +511,11 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); - Self::new(OperationType::Batch, resource_ref).with_partition_key(partition_key) + Self::new( + OperationType::Batch, + resource_ref, + OperationTarget::PartitionKey(partition_key), + ) } /// Upserts (creates or replaces) an item (document) in a container. @@ -509,7 +526,11 @@ impl CosmosOperation { /// If an item with the same ID exists, it will be replaced; otherwise, a new item is created. pub fn upsert_item(item: ItemReference) -> Self { let partition_key = item.partition_key().clone(); - Self::new(OperationType::Upsert, item).with_partition_key(partition_key) + Self::new( + OperationType::Upsert, + item, + OperationTarget::PartitionKey(partition_key), + ) } /// Replaces an existing item (document) in a container. @@ -519,7 +540,11 @@ impl CosmosOperation { /// Use `with_body()` to provide the new document JSON. pub fn replace_item(item: ItemReference) -> Self { let partition_key = item.partition_key().clone(); - Self::new(OperationType::Replace, item).with_partition_key(partition_key) + Self::new( + OperationType::Replace, + item, + OperationTarget::PartitionKey(partition_key), + ) } /// Reads (lists) all items within a single partition. 
@@ -530,7 +555,11 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); - Self::new(OperationType::ReadFeed, resource_ref).with_partition_key(partition_key) + Self::new( + OperationType::ReadFeed, + resource_ref, + OperationTarget::PartitionKey(partition_key), + ) } /// Reads (lists) all items across all partitions. @@ -544,7 +573,11 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); - Self::new(OperationType::ReadFeed, resource_ref) + Self::new( + OperationType::ReadFeed, + resource_ref, + OperationTarget::FeedRange(crate::models::FeedRange::full()), + ) } /// Queries items within a single partition. @@ -555,22 +588,29 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); - Self::new(OperationType::Query, resource_ref).with_partition_key(partition_key) + Self::new( + OperationType::Query, + resource_ref, + OperationTarget::PartitionKey(partition_key), + ) } /// Queries items across all partitions. /// /// Use `with_body()` to provide the query JSON. /// - /// This is equivalent to calling `query_items()` with [`PartitionKey::EMPTY`], - /// which causes the `x-ms-documentdb-query-enablecrosspartition` header to be - /// emitted by the pipeline. - /// /// **Warning:** Cross-partition queries are inherently less efficient than /// single-partition queries. Use `query_items()` with a partition key /// when possible. pub fn query_items_cross_partition(container: ContainerReference) -> Self { - Self::query_items(container, PartitionKey::EMPTY) + let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) + .with_resource_type(ResourceType::Document) + .into_feed_reference(); + Self::new( + OperationType::Query, + resource_ref, + OperationTarget::FeedRange(crate::models::FeedRange::full()), + ) } /// Returns true if this is a read-only operation. @@ -593,7 +633,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(account) .with_resource_type(ResourceType::Offer) .into_feed_reference(); - Self::new(OperationType::Query, resource_ref) + Self::new(OperationType::Query, resource_ref, OperationTarget::None) } /// Reads a specific offer by its ID. @@ -603,7 +643,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(account) .with_resource_type(ResourceType::Offer) .with_rid(offer_id.into()); - Self::new(OperationType::Read, resource_ref) + Self::new(OperationType::Read, resource_ref, OperationTarget::None) } /// Replaces a specific offer by its ID. 
@@ -617,7 +657,7 @@ impl CosmosOperation { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(account) .with_resource_type(ResourceType::Offer) .with_rid(offer_id.into()); - Self::new(OperationType::Replace, resource_ref) + Self::new(OperationType::Replace, resource_ref, OperationTarget::None) } } @@ -666,7 +706,7 @@ mod tests { let item_ref = ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Create, resource_ref); + let op = CosmosOperation::new(OperationType::Create, resource_ref, OperationTarget::None); assert_eq!(op.operation_type(), OperationType::Create); assert_eq!(op.resource_type(), ResourceType::Document); @@ -679,7 +719,7 @@ mod tests { let item_ref = ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Read, resource_ref); + let op = CosmosOperation::new(OperationType::Read, resource_ref, OperationTarget::None); assert_eq!(op.operation_type(), OperationType::Read); assert_eq!(op.resource_type(), ResourceType::Document); @@ -692,8 +732,11 @@ mod tests { let item_ref = ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Read, resource_ref) - .with_partition_key(PartitionKey::from("pk1")); + let op = CosmosOperation::new( + OperationType::Read, + resource_ref, + OperationTarget::PartitionKey(PartitionKey::from("pk1")), + ); assert!(op.partition_key().is_some()); } @@ -704,7 +747,8 @@ mod tests { ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); let body = b"{\"id\":\"doc1\"}".to_vec(); - let op = CosmosOperation::new(OperationType::Create, resource_ref).with_body(body.clone()); + let op = CosmosOperation::new(OperationType::Create, resource_ref, OperationTarget::None) + .with_body(body.clone()); assert_eq!(op.body(), Some(body.as_slice())); } @@ -714,7 +758,7 @@ mod tests { let item_ref = ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Replace, resource_ref); + let op = CosmosOperation::new(OperationType::Replace, resource_ref, OperationTarget::None); assert!(!op.is_read_only()); assert!(op.is_idempotent()); @@ -725,7 +769,7 @@ mod tests { let item_ref = ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Upsert, resource_ref); + let op = CosmosOperation::new(OperationType::Upsert, resource_ref, OperationTarget::None); assert!(!op.is_read_only()); assert!(!op.is_idempotent()); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs new file mode 100644 index 00000000000..12c37d8b1c7 --- /dev/null +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs @@ -0,0 +1,137 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +//! Feed range type for the Cosmos DB driver. +//! +//! 
A [`FeedRange`] represents a contiguous range of the effective partition key (EPK) space. +//! It is used by the dataflow pipeline to target operations at one or more physical partitions. + +use crate::models::effective_partition_key::EffectivePartitionKey; + +/// A contiguous range of the effective partition key space. +/// +/// Defined by `[min_inclusive, max_exclusive)` EPK boundaries. A `FeedRange` may +/// map to one or several physical partitions depending on the current partition +/// topology. +/// +/// Use [`FeedRange::full()`] for the entire key space (`""..FF`). +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct FeedRange { + min_inclusive: EffectivePartitionKey, + max_exclusive: EffectivePartitionKey, +} + +impl FeedRange { + /// Creates a feed range from explicit EPK bounds. + pub fn new(min_inclusive: EffectivePartitionKey, max_exclusive: EffectivePartitionKey) -> Self { + Self { + min_inclusive, + max_exclusive, + } + } + + /// Creates a feed range covering the entire partition key space (`""..FF`). + pub fn full() -> Self { + Self { + min_inclusive: EffectivePartitionKey::min(), + max_exclusive: EffectivePartitionKey::max(), + } + } + + /// Returns the inclusive lower bound of this range. + pub fn min_inclusive(&self) -> &EffectivePartitionKey { + &self.min_inclusive + } + + /// Returns the exclusive upper bound of this range. + pub fn max_exclusive(&self) -> &EffectivePartitionKey { + &self.max_exclusive + } + + /// Returns `true` if this feed range is entirely contained within `other`. + pub fn is_subset_of(&self, other: &FeedRange) -> bool { + other.min_inclusive <= self.min_inclusive && other.max_exclusive >= self.max_exclusive + } + + /// Returns `true` if this feed range and `other` share any portion of the EPK space. + /// + /// Two feed ranges overlap when one starts before the other ends and vice versa. + pub fn overlaps(&self, other: &FeedRange) -> bool { + self.min_inclusive < other.max_exclusive && other.min_inclusive < self.max_exclusive + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn full_range() { + let full = FeedRange::full(); + assert_eq!(full.min_inclusive().as_str(), ""); + assert_eq!(full.max_exclusive().as_str(), "FF"); + } + + #[test] + fn is_subset_of_full() { + let full = FeedRange::full(); + let sub = FeedRange::new( + EffectivePartitionKey::from("00"), + EffectivePartitionKey::from("80"), + ); + assert!(sub.is_subset_of(&full)); + assert!(!full.is_subset_of(&sub)); + } + + #[test] + fn is_subset_of_self() { + let range = FeedRange::new( + EffectivePartitionKey::from("20"), + EffectivePartitionKey::from("80"), + ); + assert!(range.is_subset_of(&range)); + } + + #[test] + fn overlaps_basic() { + let a = FeedRange::new( + EffectivePartitionKey::from("00"), + EffectivePartitionKey::from("50"), + ); + let b = FeedRange::new( + EffectivePartitionKey::from("30"), + EffectivePartitionKey::from("80"), + ); + assert!(a.overlaps(&b)); + assert!(b.overlaps(&a)); + } + + #[test] + fn overlaps_adjacent_no_overlap() { + let a = FeedRange::new( + EffectivePartitionKey::from("00"), + EffectivePartitionKey::from("50"), + ); + let b = FeedRange::new( + EffectivePartitionKey::from("50"), + EffectivePartitionKey::from("FF"), + ); + // Adjacent ranges (a's max == b's min) do NOT overlap because max is exclusive. 
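+        // e.g., ["00", "50") and ["50", "FF") tile the key space with no shared EPK.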
+ assert!(!a.overlaps(&b)); + assert!(!b.overlaps(&a)); + } + + #[test] + fn overlaps_disjoint() { + let a = FeedRange::new( + EffectivePartitionKey::from("00"), + EffectivePartitionKey::from("30"), + ); + let b = FeedRange::new( + EffectivePartitionKey::from("50"), + EffectivePartitionKey::from("FF"), + ); + assert!(!a.overlaps(&b)); + assert!(!b.overlaps(&a)); + } +} diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs index 841c0d47800..15ebf1ee411 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs @@ -31,8 +31,10 @@ pub(crate) mod vector_session_token; pub(crate) use cosmos_headers::request_header_names; #[allow(dead_code)] pub mod effective_partition_key; +mod feed_range; #[allow(dead_code)] mod murmur_hash; +mod operation_target; #[allow(dead_code)] pub mod partition_key_range; #[allow(dead_code)] @@ -53,6 +55,8 @@ pub use cosmos_response::CosmosResponse; pub use cosmos_status::CosmosStatus; pub use cosmos_status::SubStatusCode; pub use etag::{ETag, Precondition}; +pub use feed_range::FeedRange; +pub use operation_target::OperationTarget; pub use partition_key::{PartitionKey, PartitionKeyValue}; pub use request_charge::RequestCharge; pub use resource_reference::ContainerReference; diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs new file mode 100644 index 00000000000..2523ff9992c --- /dev/null +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +//! Operation targeting for Cosmos DB operations. + +use crate::models::{FeedRange, PartitionKey}; + +/// Describes how an operation targets the partition key space. +/// +/// Every [`CosmosOperation`](crate::models::CosmosOperation) carries an `OperationTarget` +/// that determines how the driver routes the request: +/// +/// - [`None`](Self::None) — account/database/container-level operations that have no +/// partition scope (e.g., create database, read container). +/// - [`PartitionKey`](Self::PartitionKey) — operations scoped to a single logical +/// partition. Always executed as a single request (point operation). +/// - [`FeedRange`](Self::FeedRange) — operations scoped to an EPK range that may +/// span one or more physical partitions (e.g., cross-partition queries). +#[derive(Clone, Debug)] +pub enum OperationTarget { + /// No partition scope. Used for account, database, and container-level operations. + /// + /// It is illegal to use this target for item-level operations inside a container. + None, + + /// Scoped to a single logical partition key. + /// + /// This can always be satisfied by a single request node — no fan-out required. + PartitionKey(PartitionKey), + + /// Scoped to a feed range (EPK range). + /// + /// The range may cover one or more physical partitions, including the full + /// container key space ([`FeedRange::full()`]). + FeedRange(FeedRange), +} + +impl OperationTarget { + /// Returns the partition key if this is a [`PartitionKey`](Self::PartitionKey) target. 
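+    /// Returns `None` for [`None`](Self::None) and [`FeedRange`](Self::FeedRange) targets.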
+    pub fn partition_key(&self) -> Option<&PartitionKey> {
+        match self {
+            Self::PartitionKey(pk) => Some(pk),
+            _ => Option::None,
+        }
+    }
+}

From 9c38dabd45108136b72e5f39bc6e8156b3f938bb Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Thu, 7 May 2026 13:54:15 -0700
Subject: [PATCH 15/29] Flow routing headers down into operation pipeline via
 `OperationOverrides`

---
 .../src/driver/cosmos_driver.rs               |  73 ++++++----
 .../src/driver/dataflow/mod.rs                |  12 +-
 .../src/driver/dataflow/request.rs            |  65 ++++-----
 .../src/driver/pipeline/operation_pipeline.rs | 133 +++++++++++++++---
 .../src/models/cosmos_headers.rs              |   4 +
 .../src/models/cosmos_operation.rs            |   7 +-
 .../src/models/operation_target.rs            |  10 --
 7 files changed, 192 insertions(+), 112 deletions(-)

diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 735e2da770a..3640831a26d 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -12,11 +12,12 @@ use crate::{
             PartitionRoutingRefresh, Pipeline, PipelineContext, Request, RequestExecutor,
             RequestTarget,
         },
+        pipeline::operation_pipeline::OperationOverrides,
         routing::{session_manager::SessionManager, CosmosEndpoint, LocationStateStore},
     },
     models::{
-        request_header_names, AccountEndpoint, AccountReference, ActivityId, ContainerProperties,
-        ContainerReference, CosmosOperation, CosmosResponse, DatabaseProperties, DatabaseReference,
+        AccountEndpoint, AccountReference, ActivityId, ContainerProperties, ContainerReference,
+        CosmosOperation, CosmosResponse, DatabaseProperties, DatabaseReference,
     },
     options::{
         ConnectionPoolOptions, DiagnosticsOptions, DriverOptions, OperationOptions,
@@ -24,7 +25,6 @@ use crate::{
     },
 };
 use arc_swap::ArcSwap;
-use azure_core::http::headers::{HeaderName, HeaderValue};
 use futures::future::BoxFuture;
 use std::error::Error as _;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -50,22 +50,37 @@ impl RequestExecutor for DriverRequestExecutor<'_> {
     fn execute_request<'a>(
         &'a mut self,
         operation: &'a CosmosOperation,
-        _target: &'a RequestTarget,
+        target: RequestTarget,
         _partition_routing_refresh: PartitionRoutingRefresh,
-        continuation: Option<&'a str>,
+        continuation: Option<String>,
     ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>> {
         let driver = self.driver;
-        let mut options = self.options.clone();
-        if let Some(continuation) = continuation {
-            let mut custom_headers = options.custom_headers().cloned().unwrap_or_default();
-            custom_headers.insert(
-                HeaderName::from_static(request_header_names::CONTINUATION),
-                HeaderValue::from(continuation.to_owned()),
-            );
-            options = options.with_custom_headers(custom_headers);
-        }
+        let overrides = match target {
+            RequestTarget::LogicalPartitionKey(pk) => OperationOverrides {
+                partition_key: Some(pk),
+                continuation,
+                ..Default::default()
+            },
+            RequestTarget::EffectivePartitionKeyRange {
+                range,
+                partition_key_range_id,
+            } => OperationOverrides {
+                partition_key_range_id: Some(partition_key_range_id.clone()),
+                feed_range: Some(range),
+                continuation,
+                ..Default::default()
+            },
+            RequestTarget::NonPartitioned => OperationOverrides {
+                continuation,
+                ..Default::default()
+            },
+        };
 
-        Box::pin(async move { driver.execute_operation_direct(operation, &options).await })
+        Box::pin(async move {
+            driver
+                .execute_operation_direct(operation, overrides, &self.options)
+                .await
+        })
     }
 }
 
@@ -1007,10 +1022,7 @@ impl CosmosDriver {
         }
         tracing::debug!("operation started");
-        match operation.target() {
-            crate::models::OperationTarget::None => {
-                return self.execute_operation_direct(&operation, &options).await;
-            }
+        let mut pipeline = match operation.target() {
             crate::models::OperationTarget::FeedRange(_) => {
                 return Err(azure_core::Error::with_message(
                     azure_core::error::ErrorKind::Other,
                     "FeedRange targeting is not yet implemented for execute_operation; \
                      use the dataflow pipeline directly for feed range operations",
                 ));
             }
-            crate::models::OperationTarget::PartitionKey(_) => {}
-        }
+            crate::models::OperationTarget::None => {
+                // We can use a single request to perform this operation, because it's not partitioned.
+                let root = Request::new(operation, RequestTarget::NonPartitioned);
+                Pipeline::new(Box::new(root))
+            }
+            crate::models::OperationTarget::PartitionKey(pk) => {
+                // We can use a single request to perform this operation, even if it's a query.
+                let target = RequestTarget::LogicalPartitionKey(pk.clone());
+                let root = Request::new(operation, target);
+                Pipeline::new(Box::new(root))
+            }
+        };
 
-        let partition_key = operation
-            .partition_key()
-            .expect("PartitionKey target matched above but partition_key() returned None")
-            .clone();
-        let target = RequestTarget::logical_partition_key(partition_key);
-        let root = Request::new(operation, target);
-        let mut pipeline = Pipeline::new(Box::new(root));
         let mut executor = DriverRequestExecutor {
             driver: self,
             options: &options,
@@ -1046,6 +1061,7 @@ impl CosmosDriver {
     async fn execute_operation_direct(
         &self,
         operation: &CosmosOperation,
+        overrides: OperationOverrides,
         options: &OperationOptions,
     ) -> azure_core::Result<CosmosResponse> {
         // Step 1: Build the single OperationOptionsView for layered resolution.
@@ -1125,6 +1141,7 @@ impl CosmosDriver {
         // Step 7: Execute via the new operation pipeline
         super::pipeline::operation_pipeline::execute_operation_pipeline(
             operation,
+            overrides,
             &effective_options,
             options.custom_headers(),
             self.location_state_store.as_ref(),
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
index ee4434cd37f..9d2bd965667 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
@@ -26,9 +26,9 @@ pub(crate) trait RequestExecutor: Send {
     fn execute_request<'a>(
         &'a mut self,
         operation: &'a CosmosOperation,
-        target: &'a RequestTarget,
+        target: RequestTarget,
         partition_routing_refresh: PartitionRoutingRefresh,
-        continuation: Option<&'a str>,
+        continuation: Option<String>,
     ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>>;
 }
 
@@ -46,9 +46,9 @@ impl<'a> PipelineContext<'a> {
     async fn execute_request(
         &mut self,
         operation: &CosmosOperation,
-        target: &RequestTarget,
+        target: RequestTarget,
         partition_routing_refresh: PartitionRoutingRefresh,
-        continuation: Option<&str>,
+        continuation: Option<String>,
     ) -> azure_core::Result<CosmosResponse> {
         self.request_executor
             .execute_request(operation, target, partition_routing_refresh, continuation)
@@ -138,9 +138,9 @@ mod tests {
         fn execute_request<'a>(
             &'a mut self,
             _operation: &'a CosmosOperation,
-            _target: &'a RequestTarget,
+            _target: RequestTarget,
             _partition_routing_refresh: PartitionRoutingRefresh,
-            _continuation: Option<&'a str>,
+            _continuation: Option<String>,
         ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>> {
             Box::pin(async {
                 Err(azure_core::Error::with_message(
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
index
55e8bd216fd..a671fd2868c 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
@@ -3,51 +3,31 @@
 
 //! Request leaf node for the dataflow pipeline.
 
-use std::ops::Range;
-
 use async_trait::async_trait;
 use azure_core::http::StatusCode;
 
-use crate::models::{
-    effective_partition_key::EffectivePartitionKey, CosmosOperation, CosmosResponse, PartitionKey,
-    SubStatusCode,
-};
+use crate::models::{CosmosOperation, CosmosResponse, FeedRange, PartitionKey, SubStatusCode};
 
 use super::{PartitionRoutingRefresh, PipelineContext, PipelineNode};
 
 /// The target of a request node.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub(crate) enum RequestTarget {
+    /// The request is to a non-partitioned resource (databases, containers, offers, etc.)
+    NonPartitioned,
+
     /// A single logical partition key.
     LogicalPartitionKey(PartitionKey),
 
     /// An effective partition key range believed to be in one physical partition.
     EffectivePartitionKeyRange {
         /// EPK range scoped by this request.
-        range: Range<EffectivePartitionKey>,
+        range: FeedRange,
 
         /// Partition key range ID believed to contain `range`.
         partition_key_range_id: String,
     },
 }
 
-impl RequestTarget {
-    /// Creates a logical partition key target.
-    pub(crate) fn logical_partition_key(partition_key: PartitionKey) -> Self {
-        Self::LogicalPartitionKey(partition_key)
-    }
-
-    /// Creates an EPK range target believed to be contained by one physical partition.
-    pub(crate) fn effective_partition_key_range(
-        range: Range<EffectivePartitionKey>,
-        partition_key_range_id: impl Into<String>,
-    ) -> Self {
-        Self::EffectivePartitionKeyRange {
-            range,
-            partition_key_range_id: partition_key_range_id.into(),
-        }
-    }
-}
-
 /// Leaf node that executes one Cosmos DB request per page.
 pub(crate) struct Request {
     operation: CosmosOperation,
@@ -101,9 +81,9 @@ impl PipelineNode for Request {
         match context
             .execute_request(
                 &self.operation,
-                &self.target,
+                self.target.clone(),
                 PartitionRoutingRefresh::UseCached,
-                self.latest_server_continuation.as_deref(),
+                self.latest_server_continuation.clone(),
             )
             .await
         {
@@ -127,6 +107,10 @@ impl Request {
         error: azure_core::Error,
     ) -> azure_core::Result<Option<CosmosResponse>> {
         match &self.target {
+            RequestTarget::NonPartitioned => {
+                // Non-partitioned resources don't have partition topology changes.
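+                // There is nothing to refresh or retry here, so surface the error unchanged.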
+                Err(error)
+            }
             RequestTarget::LogicalPartitionKey(_) => {
                 if self.logical_partition_topology_retry_used {
                     return Err(error);
@@ -136,9 +120,9 @@
                 context
                     .execute_request(
                         &self.operation,
-                        &self.target,
+                        self.target.clone(),
                         PartitionRoutingRefresh::ForceRefresh,
-                        self.latest_server_continuation.as_deref(),
+                        self.latest_server_continuation.clone(),
                     )
                     .await
                     .map(|response| self.record_response_continuation(response))
@@ -191,7 +175,8 @@ mod tests {
     diagnostics::DiagnosticsContextBuilder,
     driver::dataflow::RequestExecutor,
     models::{
-        AccountReference, ActivityId, CosmosResponseHeaders, CosmosStatus, DatabaseReference,
+        effective_partition_key::EffectivePartitionKey, AccountReference, ActivityId,
+        CosmosResponseHeaders, CosmosStatus, DatabaseReference,
     },
     options::DiagnosticsOptions,
 };
@@ -216,13 +201,12 @@
         fn execute_request<'a>(
             &'a mut self,
             _operation: &'a CosmosOperation,
-            _target: &'a RequestTarget,
+            _target: RequestTarget,
             partition_routing_refresh: PartitionRoutingRefresh,
-            continuation: Option<&'a str>,
+            continuation: Option<String>,
         ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>> {
             self.refresh_calls.push(partition_routing_refresh);
-            self.continuation_calls
-                .push(continuation.map(str::to_owned));
+            self.continuation_calls.push(continuation);
             let response = self.responses.pop_front().expect("mock request response");
             Box::pin(async move { response })
         }
@@ -238,14 +222,17 @@
     }
 
     fn logical_partition_target() -> RequestTarget {
-        RequestTarget::logical_partition_key(PartitionKey::from("pk"))
+        RequestTarget::LogicalPartitionKey(PartitionKey::from("pk"))
     }
 
     fn epk_range_target() -> RequestTarget {
-        RequestTarget::effective_partition_key_range(
-            EffectivePartitionKey::from("00")..EffectivePartitionKey::from("80"),
-            "0",
-        )
+        RequestTarget::EffectivePartitionKeyRange {
+            range: FeedRange::new(
+                EffectivePartitionKey::min(),
+                EffectivePartitionKey::from("80"),
+            ),
+            partition_key_range_id: "0".to_string(),
+        }
     }
 
     fn response(body: &[u8]) -> CosmosResponse {
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs
index fc5ad4755fe..04f50df1ca8 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs
@@ -38,6 +38,72 @@ use crate::driver::transport::{
     AuthorizationContext, CosmosTransport,
 };
 
+/// Per-request overrides that take precedence over values from [`CosmosOperation`].
+///
+/// Used by the dataflow pipeline to inject routing and pagination state that
+/// varies per physical partition or per page, without mutating the shared
+/// `CosmosOperation`. Each field, when `Some`, emits the corresponding request
+/// header in [`OperationOverrides::apply_headers`].
+#[derive(Debug, Clone, Default)]
+pub(crate) struct OperationOverrides {
+    /// Feed range to constrain the request to (emits `x-ms-start-epk` / `x-ms-end-epk`).
+    pub feed_range: Option<FeedRange>,
+
+    /// Physical partition key range ID (emits `x-ms-documentdb-partitionkeyrangeid`).
+    pub partition_key_range_id: Option<String>,
+
+    /// Logical partition key (emits `x-ms-documentdb-partitionkey`).
+    pub partition_key: Option<PartitionKey>,
+
+    /// Continuation token for pagination (emits `x-ms-continuation`).
+    pub continuation: Option<String>,
+}
+
+impl OperationOverrides {
+    /// Applies the override headers to the given header map.
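+    /// Fields left as `None` emit no headers.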
+    ///
+    /// Headers set here take precedence over any previously-set values for
+    /// the same header name (they overwrite on conflict).
+    pub fn apply_headers(
+        &self,
+        headers: &mut azure_core::http::headers::Headers,
+    ) -> azure_core::Result<()> {
+        if let Some(feed_range) = &self.feed_range {
+            headers.insert(
+                HeaderName::from_static(request_header_names::START_EPK),
+                HeaderValue::from(feed_range.min_inclusive().as_str().to_owned()),
+            );
+            headers.insert(
+                HeaderName::from_static(request_header_names::END_EPK),
+                HeaderValue::from(feed_range.max_exclusive().as_str().to_owned()),
+            );
+        }
+
+        if let Some(pk_range_id) = &self.partition_key_range_id {
+            headers.insert(
+                HeaderName::from_static(request_header_names::PARTITION_KEY_RANGE_ID),
+                HeaderValue::from(pk_range_id.clone()),
+            );
+        }
+
+        if let Some(pk) = &self.partition_key {
+            let pk_headers = pk.as_headers()?;
+            for (name, value) in pk_headers {
+                headers.insert(name, value);
+            }
+        }
+
+        if let Some(continuation) = &self.continuation {
+            headers.insert(
+                HeaderName::from_static(request_header_names::CONTINUATION),
+                HeaderValue::from(continuation.clone()),
+            );
+        }
+
+        Ok(())
+    }
+}
+
 /// Executes a Cosmos DB operation through the new pipeline architecture.
 ///
 /// This is the entry point called by `CosmosDriver::execute_operation`.
@@ -45,6 +111,7 @@
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn execute_operation_pipeline(
     operation: &CosmosOperation,
+    overrides: OperationOverrides,
     options: &OperationOptionsView<'_>,
     custom_headers: Option<&std::collections::HashMap<HeaderName, HeaderValue>>,
     location_state_store: &LocationStateStore,
@@ -147,7 +214,8 @@
             .flatten(),
         throughput_control,
     };
-    let mut transport_request = build_transport_request(operation, custom_headers, &ctx)?;
+    let mut transport_request =
+        build_transport_request(operation, &overrides, custom_headers, &ctx)?;
 
     // Apply content-response-on-write preference.
     // By default, (None or Disabled), suppress the response body for write
@@ -439,8 +507,11 @@ struct TransportRequestContext<'a> {
 /// Builds a `TransportRequest` from the operation and routing decision.
 ///
 /// If `resolved_session_token` is provided, it is added to the request headers.
+/// Override headers from `overrides` are applied after operation headers, so they
+/// take precedence.
 fn build_transport_request(
     operation: &CosmosOperation,
+    overrides: &OperationOverrides,
     custom_headers: Option<&std::collections::HashMap<HeaderName, HeaderValue>>,
     ctx: &TransportRequestContext<'_>,
 ) -> azure_core::Result<TransportRequest> {
@@ -485,14 +556,6 @@
         );
     }
 
-    // Add partition key headers
-    if let Some(pk) = operation.partition_key() {
-        let pk_headers = pk.as_headers()?;
-        for (name, value) in pk_headers {
-            headers.insert(name, value);
-        }
-    }
-
     // Cosmos DB uses POST for both create and upsert; the service
     // distinguishes them via this header.
     if operation.operation_type() == OperationType::Upsert {
@@ -533,6 +596,10 @@
         }
     }
 
+    // Apply overrides — these take precedence over operation-level headers
+    // (e.g., an override partition key replaces the operation's partition key).
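+    // (The resolved session token is added separately below and is not affected.)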
+ overrides.apply_headers(&mut headers)?; + // Add resolved session token if let Some(token) = &ctx.resolved_session_token { headers.insert( @@ -641,6 +708,7 @@ mod tests { use url::Url; use super::build_transport_request; + use super::OperationOverrides; use super::TransportRequestContext; use crate::{ diagnostics::ExecutionContext, @@ -711,7 +779,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); assert_eq!(request.url.path(), "/dbs"); } @@ -732,7 +801,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); assert_eq!(request.url.path(), "/dbs/mydb"); } @@ -753,7 +823,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); let activity_header = request .headers @@ -778,8 +849,12 @@ mod tests { resolved_session_token: None, throughput_control: None, }; - let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + let overrides = OperationOverrides { + partition_key: Some(PartitionKey::from("pk1")), + ..Default::default() + }; + let request = build_transport_request(&operation, &overrides, None, &ctx) + .expect("request should build"); let partition_key_header = request .headers @@ -812,7 +887,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); assert_eq!( request.url.as_str(), @@ -842,7 +918,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); assert_eq!( request.url.as_str(), @@ -1279,7 +1356,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); let is_upsert = request .headers @@ -1312,7 +1390,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); assert!( request @@ -1346,7 +1425,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); assert_eq!( request @@ -1392,7 +1472,8 @@ mod tests { throughput_control: None, }; let request = - build_transport_request(&operation, None, &ctx).expect("request should build"); + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .expect("request should build"); assert!( request 
@@ -1429,7 +1510,9 @@ mod tests { resolved_session_token: None, throughput_control: Some(&snapshot), }; - let request = build_transport_request(&operation, None, &ctx).unwrap(); + let request = + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .unwrap(); let priority = request .headers @@ -1472,7 +1555,9 @@ mod tests { resolved_session_token: None, throughput_control: Some(&snapshot), }; - let request = build_transport_request(&operation, None, &ctx).unwrap(); + let request = + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .unwrap(); let bucket = request .headers @@ -1516,7 +1601,9 @@ mod tests { resolved_session_token: None, throughput_control: Some(&snapshot), }; - let request = build_transport_request(&operation, None, &ctx).unwrap(); + let request = + build_transport_request(&operation, &OperationOverrides::default(), None, &ctx) + .unwrap(); assert_eq!( request.headers.get_optional_str(&HeaderName::from_static( diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs index b8d8c36eb4d..22c60131fc2 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs @@ -29,6 +29,10 @@ pub(crate) mod request_header_names { pub const OFFER_AUTOPILOT_SETTINGS: &str = "x-ms-cosmos-offer-autopilot-settings"; pub const PRIORITY_LEVEL: &str = "x-ms-cosmos-priority-level"; pub const THROUGHPUT_BUCKET: &str = "x-ms-cosmos-throughput-bucket"; + pub const START_EPK: &str = "x-ms-start-epk"; + pub const END_EPK: &str = "x-ms-end-epk"; + pub const PARTITION_KEY: &str = "x-ms-documentdb-partitionkey"; + pub const PARTITION_KEY_RANGE_ID: &str = "x-ms-documentdb-partitionkeyrangeid"; } /// Standard Cosmos DB response header names. diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs index 81029812f0e..153f7bd6a7a 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs @@ -117,11 +117,6 @@ impl CosmosOperation { &self.target } - /// Returns the partition key if this operation targets a single logical partition. - pub fn partition_key(&self) -> Option<&PartitionKey> { - self.target.partition_key() - } - /// Returns the request headers. pub fn request_headers(&self) -> &CosmosRequestHeaders { &self.request_headers @@ -738,7 +733,7 @@ mod tests { OperationTarget::PartitionKey(PartitionKey::from("pk1")), ); - assert!(op.partition_key().is_some()); + assert!(matches!(op.target(), OperationTarget::PartitionKey(_))); } #[test] diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs index 2523ff9992c..0670a88ec46 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs @@ -34,13 +34,3 @@ pub enum OperationTarget { /// container key space ([`FeedRange::full()`]). FeedRange(FeedRange), } - -impl OperationTarget { - /// Returns the partition key if this is a [`PartitionKey`](Self::PartitionKey) target. 
-    pub fn partition_key(&self) -> Option<&PartitionKey> {
-        match self {
-            Self::PartitionKey(pk) => Some(pk),
-            _ => Option::None,
-        }
-    }
-}

From 3b1b8976129b1f5037df83edd081480980801ffc Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Thu, 7 May 2026 15:04:48 -0700
Subject: [PATCH 16/29] Add an explicit "Plan" API to set up a pipeline

---
 .../src/driver/cosmos_driver.rs               |  24 +--
 .../src/driver/dataflow/mod.rs                |  15 +-
 .../src/driver/dataflow/planner.rs            | 142 ++++++++++++++++++
 .../src/driver/dataflow/request.rs            |   1 -
 .../src/models/mod.rs                         | 107 +++++++++++++
 5 files changed, 265 insertions(+), 24 deletions(-)
 create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs

diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 3640831a26d..18ccd020bf4 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -9,8 +9,7 @@ use crate::{
     },
     driver::{
         dataflow::{
-            PartitionRoutingRefresh, Pipeline, PipelineContext, Request, RequestExecutor,
-            RequestTarget,
+            planner, PartitionRoutingRefresh, PipelineContext, RequestExecutor, RequestTarget,
         },
         pipeline::operation_pipeline::OperationOverrides,
         routing::{session_manager::SessionManager, CosmosEndpoint, LocationStateStore},
@@ -1022,26 +1021,7 @@ impl CosmosDriver {
         }
 
         tracing::debug!("operation started");
-        let mut pipeline = match operation.target() {
-            crate::models::OperationTarget::FeedRange(_) => {
-                return Err(azure_core::Error::with_message(
-                    azure_core::error::ErrorKind::Other,
-                    "FeedRange targeting is not yet implemented for execute_operation; \
-                     use the dataflow pipeline directly for feed range operations",
-                ));
-            }
-            crate::models::OperationTarget::None => {
-                // We can use a single request to perform this operation, because it's not partitioned.
-                let root = Request::new(operation, RequestTarget::NonPartitioned);
-                Pipeline::new(Box::new(root))
-            }
-            crate::models::OperationTarget::PartitionKey(pk) => {
-                // We can use a single request to perform this operation, even if it's a query.
-                let target = RequestTarget::LogicalPartitionKey(pk.clone());
-                let root = Request::new(operation, target);
-                Pipeline::new(Box::new(root))
-            }
-        };
+        let mut pipeline = planner::plan_pipeline(&operation)?;
 
         let mut executor = DriverRequestExecutor {
             driver: self,
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
index 9d2bd965667..820e5c5264d 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
@@ -3,6 +3,7 @@
 
 //! Dataflow pipeline nodes for paged Cosmos DB operations.
 
+pub(crate) mod planner;
 mod request;
 
 use futures::future::BoxFuture;
@@ -62,7 +63,7 @@
 /// allocation is negligible compared to the multi-millisecond network I/O
 /// of a Cosmos DB request.
 #[async_trait::async_trait]
-pub(crate) trait PipelineNode: Send {
+pub(crate) trait PipelineNode: Send + std::any::Any {
     /// Emits the next page of results, or `None` when this node is drained.
     async fn next_page(
         &mut self,
@@ -73,6 +74,13 @@
     fn children(&self) -> &[Box<dyn PipelineNode>];
 }
 
+impl dyn PipelineNode {
+    /// Downcasts this node to a concrete type.
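+    /// Returns `None` when the node's concrete type is not `T`.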
+    pub(crate) fn downcast_ref<T: PipelineNode>(&self) -> Option<&T> {
+        (self as &dyn std::any::Any).downcast_ref::<T>()
+    }
+}
+
 /// A pipeline root that owns the node tree.
 pub(crate) struct Pipeline {
     root: Box<dyn PipelineNode>,
@@ -84,6 +92,11 @@ impl Pipeline {
         Self { root }
     }
 
+    /// Returns a reference to the root node.
+    pub(crate) fn root(&self) -> &dyn PipelineNode {
+        &*self.root
+    }
+
     /// Emits the next page from the root node.
     pub(crate) async fn next_page(
         &mut self,
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
new file mode 100644
index 00000000000..259357409ee
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
@@ -0,0 +1,142 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! Pipeline planner for Cosmos DB operations.
+//!
+//! The planner validates an operation's target against its resource type and
+//! constructs the appropriate dataflow [`Pipeline`].
+
+use crate::models::{CosmosOperation, OperationTarget};
+
+use super::{Pipeline, Request, RequestTarget};
+
+/// Validates and builds a [`Pipeline`] for the given operation.
+///
+/// This is the "Planning" phase of operation execution. It:
+/// 1. Validates that the operation's target is compatible with its resource type.
+/// 2. Maps the operation target to a pipeline node tree (currently a single
+///    [`Request`] leaf node for point and single-partition operations).
+pub(crate) fn plan_pipeline(operation: &CosmosOperation) -> azure_core::Result<Pipeline> {
+    let resource_type = operation.resource_type();
+    let target = operation.target();
+
+    if !resource_type.is_valid_target(target) {
+        return Err(azure_core::Error::with_message(
+            azure_core::error::ErrorKind::Other,
+            format!(
+                "operation target {target_desc} is not valid for resource type {resource_type}",
+                target_desc = target_description(target),
+            ),
+        ));
+    }
+
+    let request_target = match target {
+        OperationTarget::None => RequestTarget::NonPartitioned,
+        OperationTarget::PartitionKey(pk) => RequestTarget::LogicalPartitionKey(pk.clone()),
+        OperationTarget::FeedRange(_) => {
+            return Err(azure_core::Error::with_message(
+                azure_core::error::ErrorKind::Other,
+                "FeedRange targeting is not yet implemented; \
+                 fan-out pipeline planning requires partition resolution",
+            ));
+        }
+    };
+
+    let root = Request::new(operation.clone(), request_target);
+    Ok(Pipeline::new(Box::new(root)))
+}
+
+fn target_description(target: &OperationTarget) -> &'static str {
+    match target {
+        OperationTarget::None => "None",
+        OperationTarget::PartitionKey(_) => "PartitionKey",
+        OperationTarget::FeedRange(_) => "FeedRange",
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::borrow::Cow;
+
+    use super::*;
+    use crate::models::{
+        AccountReference, ContainerProperties, ContainerReference, DatabaseReference,
+        ItemReference, OperationType, PartitionKey, PartitionKeyDefinition, ResourceType,
+        SystemProperties,
+    };
+
+    fn test_account() -> AccountReference {
+        AccountReference::with_master_key(
+            url::Url::parse("https://test.documents.azure.com:443/").unwrap(),
+            "dGVzdA==",
+        )
+    }
+
+    fn test_database() -> DatabaseReference {
+        DatabaseReference::from_name(test_account(), "db".to_owned())
+    }
+
+    fn test_partition_key_definition() -> PartitionKeyDefinition {
+        serde_json::from_str(r#"{"paths":["/pk"]}"#).unwrap()
+    }
+
+    fn test_container_props() -> ContainerProperties {
+        ContainerProperties {
+            id: Cow::Owned("coll".into()),
+            partition_key: test_partition_key_definition(),
+            system_properties: SystemProperties::default(),
+        }
+    }
+
+    fn test_container() -> ContainerReference {
+        ContainerReference::new(
+            test_account(),
+            "db",
+            "db_rid",
+            "coll",
+            "coll_rid",
+            &test_container_props(),
+        )
+    }
+
+    // --- plan_pipeline tests ---
+
+    #[test]
+    fn plans_non_partitioned_pipeline_for_database_read() {
+        let op = CosmosOperation::read_database(test_database());
+        let pipeline = plan_pipeline(&op).unwrap();
+
+        let request = pipeline.root().downcast_ref::<Request>().unwrap();
+        assert_eq!(*request.target(), RequestTarget::NonPartitioned);
+        assert_eq!(request.operation().operation_type(), OperationType::Read);
+        assert_eq!(request.operation().resource_type(), ResourceType::Database);
+    }
+
+    #[test]
+    fn plans_logical_partition_pipeline_for_item_read() {
+        let pk = PartitionKey::from("pk-value");
+        let item = ItemReference::from_name(&test_container(), pk.clone(), "doc1");
+        let op = CosmosOperation::read_item(item);
+        let pipeline = plan_pipeline(&op).unwrap();
+
+        let request = pipeline.root().downcast_ref::<Request>().unwrap();
+        assert_eq!(
+            *request.target(),
+            RequestTarget::LogicalPartitionKey(pk.clone())
+        );
+        assert_eq!(request.operation().operation_type(), OperationType::Read);
+        assert_eq!(request.operation().resource_type(), ResourceType::Document);
+    }
+
+    #[test]
+    fn rejects_feed_range_target() {
+        let op = CosmosOperation::read_all_items_cross_partition(test_container());
+        let result = plan_pipeline(&op);
+
+        let err = result.err().expect("expected error for FeedRange target");
+        assert!(
+            err.to_string().contains("FeedRange"),
+            "expected FeedRange error, got: {err}"
+        );
+    }
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
index a671fd2868c..3618e8547b8 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
@@ -99,7 +99,6 @@ impl PipelineNode for Request {
         &[]
     }
 }
-
 impl Request {
     async fn handle_partition_topology_change(
         &mut self,
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs
index 15ebf1ee411..b51af12e8f9 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs
@@ -374,6 +374,40 @@ impl ResourceType {
         )
     }
 
+    /// Returns true if the given [`OperationTarget`] is valid for this resource type.
+    ///
+    /// Each resource type only supports a subset of targeting modes:
+    /// - Non-partitioned resources (`DatabaseAccount`, `Database`, `DocumentCollection`,
+    ///   `PartitionKeyRange`, `Offer`) require [`OperationTarget::None`].
+    /// - Documents require either a [`OperationTarget::PartitionKey`] or
+    ///   [`OperationTarget::FeedRange`].
+    /// - Server-side code resources (`StoredProcedure`, `Trigger`, `UserDefinedFunction`)
+    ///   accept [`OperationTarget::None`] for CRUD and [`OperationTarget::PartitionKey`]
+    ///   for execution.
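+    ///
+    /// For example, `ResourceType::Document.is_valid_target(&OperationTarget::None)`
+    /// is `false`, while `ResourceType::Database.is_valid_target(&OperationTarget::None)`
+    /// is `true`.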
+ pub fn is_valid_target(self, target: &OperationTarget) -> bool { + match self { + ResourceType::DatabaseAccount + | ResourceType::Database + | ResourceType::DocumentCollection + | ResourceType::PartitionKeyRange + | ResourceType::Offer => matches!(target, OperationTarget::None), + + ResourceType::Document => matches!( + target, + OperationTarget::PartitionKey(_) | OperationTarget::FeedRange(_) + ), + + ResourceType::StoredProcedure + | ResourceType::Trigger + | ResourceType::UserDefinedFunction => { + matches!( + target, + OperationTarget::None | OperationTarget::PartitionKey(_) + ) + } + } + } + /// Returns true if this resource type requires a database reference. pub fn requires_database(self) -> bool { matches!( @@ -822,4 +856,77 @@ mod tests { // Higher version (2) wins for globalLSN; region 1: max(100, 50) = 100 assert_eq!(merged.as_str(), "0:2#200#1=100"); } + + // --- ResourceType::is_valid_target --- + + #[test] + fn none_target_valid_for_database() { + assert!(ResourceType::Database.is_valid_target(&OperationTarget::None)); + } + + #[test] + fn none_target_valid_for_database_account() { + assert!(ResourceType::DatabaseAccount.is_valid_target(&OperationTarget::None)); + } + + #[test] + fn none_target_valid_for_document_collection() { + assert!(ResourceType::DocumentCollection.is_valid_target(&OperationTarget::None)); + } + + #[test] + fn none_target_valid_for_offer() { + assert!(ResourceType::Offer.is_valid_target(&OperationTarget::None)); + } + + #[test] + fn none_target_valid_for_partition_key_range() { + assert!(ResourceType::PartitionKeyRange.is_valid_target(&OperationTarget::None)); + } + + #[test] + fn none_target_invalid_for_document() { + assert!(!ResourceType::Document.is_valid_target(&OperationTarget::None)); + } + + #[test] + fn partition_key_valid_for_document() { + let pk = OperationTarget::PartitionKey(PartitionKey::from("pk")); + assert!(ResourceType::Document.is_valid_target(&pk)); + } + + #[test] + fn feed_range_valid_for_document() { + let fr = OperationTarget::FeedRange(FeedRange::full()); + assert!(ResourceType::Document.is_valid_target(&fr)); + } + + #[test] + fn partition_key_invalid_for_database() { + let pk = OperationTarget::PartitionKey(PartitionKey::from("pk")); + assert!(!ResourceType::Database.is_valid_target(&pk)); + } + + #[test] + fn feed_range_invalid_for_database() { + let fr = OperationTarget::FeedRange(FeedRange::full()); + assert!(!ResourceType::Database.is_valid_target(&fr)); + } + + #[test] + fn none_target_valid_for_stored_procedure() { + assert!(ResourceType::StoredProcedure.is_valid_target(&OperationTarget::None)); + } + + #[test] + fn partition_key_valid_for_stored_procedure() { + let pk = OperationTarget::PartitionKey(PartitionKey::from("pk")); + assert!(ResourceType::StoredProcedure.is_valid_target(&pk)); + } + + #[test] + fn feed_range_invalid_for_stored_procedure() { + let fr = OperationTarget::FeedRange(FeedRange::full()); + assert!(!ResourceType::StoredProcedure.is_valid_target(&fr)); + } } From 067538b826bc69a07bad256326c25deac6811af8 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Thu, 7 May 2026 15:22:14 -0700 Subject: [PATCH 17/29] Add SequentialDrain, split recovery, topology Build the SequentialDrain pipeline node for cross-partition feed operations. It drains children left-to-right by EPK range, handling partition splits by splicing replacement nodes at the current position. 
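Because children are kept in EPK order and a split child's replacements are
spliced in at its position, pages continue to be emitted in EPK order even
across splits.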
- Add PageResult enum (Page/Drained/SplitRequired) and ChildNodes enum
  for correct VecDeque iteration
- Add TopologyProvider trait and CachedTopologyProvider adapter backed
  by PartitionKeyRangeCache
- Update Request node with EPK split recovery via TopologyProvider,
  returning SplitRequired to parent
- Make TopologyProvider required on PipelineContext
- Extract shared test mocks to dataflow/mocks.rs
- 35 dataflow tests covering draining, splits, errors

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/driver/cosmos_driver.rs               |  25 +-
 .../src/driver/dataflow/drain.rs              | 505 ++++++++++++++++++
 .../src/driver/dataflow/mocks.rs              | 249 +++++++++
 .../src/driver/dataflow/mod.rs                | 230 +++++---
 .../src/driver/dataflow/request.rs            | 400 +++++++++-----
 .../src/driver/dataflow/topology.rs           | 272 ++++++++++
 6 files changed, 1469 insertions(+), 212 deletions(-)
 create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs
 create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
 create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs

diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 18ccd020bf4..a4589002a8a 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -10,6 +10,7 @@ use crate::{
     driver::{
         dataflow::{
             planner, PartitionRoutingRefresh, PipelineContext, RequestExecutor, RequestTarget,
+            ResolvedRange, TopologyProvider,
         },
         pipeline::operation_pipeline::OperationOverrides,
         routing::{session_manager::SessionManager, CosmosEndpoint, LocationStateStore},
@@ -83,6 +84,27 @@ impl RequestExecutor for DriverRequestExecutor<'_> {
     }
 }
 
+/// Stub topology provider for the current single-request pipeline.
+///
+/// Cross-partition feed operations will replace this with a
+/// [`CachedTopologyProvider`](super::dataflow::CachedTopologyProvider) backed
+/// by the driver's partition key range cache.
+struct StubTopologyProvider;
+
+impl TopologyProvider for StubTopologyProvider {
+    fn resolve_ranges<'a>(
+        &'a mut self,
+        _range: &'a crate::models::FeedRange,
+    ) -> BoxFuture<'a, azure_core::Result<Vec<ResolvedRange>>> {
+        Box::pin(async {
+            Err(azure_core::Error::with_message(
+                azure_core::error::ErrorKind::Other,
+                "topology resolution not yet wired up for this pipeline",
+            ))
+        })
+    }
+}
+
 /// Cosmos DB driver instance.
 ///
 /// A driver represents a connection to a specific Cosmos DB account. It is created
@@ -1027,7 +1049,8 @@ impl CosmosDriver {
             driver: self,
             options: &options,
         };
-        let mut context = PipelineContext::new(&mut executor);
+        let mut topology = StubTopologyProvider;
+        let mut context = PipelineContext::new(&mut executor, &mut topology);
 
         match pipeline.next_page(&mut context).await? {
             Some(response) => Ok(response),
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs
new file mode 100644
index 00000000000..7e419702528
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs
@@ -0,0 +1,505 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! Sequential drain node for cross-partition feed operations.
+//!
+//! `SequentialDrain` iterates its children in EPK order (left to right),
+//! fully draining one child before advancing to the next.
When a child
+//! signals a partition split via [`PageResult::SplitRequired`], the drain
+//! splices replacement nodes into its children list and retries.
+
+use std::collections::VecDeque;
+
+use async_trait::async_trait;
+
+use super::{ChildNodes, PageResult, PipelineContext, PipelineNode};
+
+/// Maximum number of consecutive split retries before giving up.
+///
+/// In practice a split produces 2–3 new ranges. This limit prevents infinite
+/// loops if the topology provider keeps returning splits.
+const MAX_SPLIT_RETRIES: usize = 10;
+
+/// Drains child nodes sequentially in EPK order.
+///
+/// Each call to `next_page` returns the next page from the left-most (lowest EPK)
+/// child. When that child is drained, it is removed and the next child becomes active.
+/// When all children are drained, the node itself is drained.
+pub(crate) struct SequentialDrain {
+    children: VecDeque<Box<dyn PipelineNode>>,
+}
+
+impl SequentialDrain {
+    /// Creates a new sequential drain over the given children.
+    ///
+    /// Children must be ordered by EPK range from smallest to largest.
+    pub(crate) fn new(children: Vec<Box<dyn PipelineNode>>) -> Self {
+        Self {
+            children: children.into(),
+        }
+    }
+}
+
+#[async_trait]
+impl PipelineNode for SequentialDrain {
+    async fn next_page(
+        &mut self,
+        context: &mut PipelineContext<'_>,
+    ) -> azure_core::Result<PageResult> {
+        let mut split_retries = 0;
+
+        loop {
+            let Some(current) = self.children.front_mut() else {
+                return Ok(PageResult::Drained);
+            };
+
+            match current.next_page(context).await? {
+                PageResult::Page(response) => return Ok(PageResult::Page(response)),
+                PageResult::Drained => {
+                    self.children.pop_front();
+                    // Loop to try the next child.
+                }
+                PageResult::SplitRequired { replacement_nodes } => {
+                    split_retries += 1;
+                    if split_retries > MAX_SPLIT_RETRIES {
+                        // This should be ridiculously rare.
+                        // The topology provider already waits for splits to converge before returning.
+                        return Err(azure_core::Error::with_message(
+                            azure_core::error::ErrorKind::Other,
+                            format!(
+                                "exceeded maximum split retries ({MAX_SPLIT_RETRIES}) \
+                                 in SequentialDrain"
+                            ),
+                        ));
+                    }
+
+                    // Remove the split child and splice in replacements at the front.
+                    self.children.pop_front();
+                    for (i, node) in replacement_nodes.into_iter().enumerate() {
+                        self.children.insert(i, node);
+                    }
+                    // Loop to drain the first replacement.
+ } + } + } + } + + fn children(&self) -> ChildNodes<'_> { + let (front, back) = self.children.as_slices(); + if back.is_empty() { + ChildNodes::Slice(front) + } else { + ChildNodes::Split(front, back) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::driver::dataflow::mocks::*; + + #[tokio::test] + async fn drains_single_child() { + let child = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"p1"))), + Ok(PageResult::Page(response(b"p2"))), + Ok(PageResult::Drained), + ]); + let mut drain = SequentialDrain::new(vec![Box::new(child)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"p1" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"p2" + ); + assert_drained(drain.next_page(&mut context).await); + } + + #[tokio::test] + async fn drains_multiple_children_in_order() { + let child1 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"c1-p1"))), + Ok(PageResult::Drained), + ]); + let child2 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"c2-p1"))), + Ok(PageResult::Page(response(b"c2-p2"))), + Ok(PageResult::Drained), + ]); + let child3 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"c3-p1"))), + Ok(PageResult::Drained), + ]); + let mut drain = + SequentialDrain::new(vec![Box::new(child1), Box::new(child2), Box::new(child3)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c1-p1" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c2-p1" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c2-p2" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c3-p1" + ); + assert_drained(drain.next_page(&mut context).await); + } + + #[tokio::test] + async fn empty_drain_is_immediately_drained() { + let mut drain = SequentialDrain::new(vec![]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + assert_drained(drain.next_page(&mut context).await); + } + + #[tokio::test] + async fn propagates_child_error() { + let child = MockLeaf::with_pages(vec![Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "test error", + ))]); + let mut drain = SequentialDrain::new(vec![Box::new(child)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + let err = drain.next_page(&mut context).await.unwrap_err(); + assert!(err.to_string().contains("test error")); + } + + #[tokio::test] + async fn handles_split_of_first_child() { + let replacement1 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"split-left"))), + Ok(PageResult::Drained), + ]); + let replacement2 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"split-right"))), + Ok(PageResult::Drained), + ]); + + let split_child = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired { + replacement_nodes: vec![Box::new(replacement1), Box::new(replacement2)], + })]); + + let trailing_child = MockLeaf::with_pages(vec![ + 
Ok(PageResult::Page(response(b"trailing"))), + Ok(PageResult::Drained), + ]); + + let mut drain = SequentialDrain::new(vec![Box::new(split_child), Box::new(trailing_child)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"split-left" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"split-right" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"trailing" + ); + assert_drained(drain.next_page(&mut context).await); + } + + #[tokio::test] + async fn handles_split_of_middle_child() { + let child1 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"c1"))), + Ok(PageResult::Drained), + ]); + + let replacement = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"c2-split"))), + Ok(PageResult::Drained), + ]); + let split_child = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired { + replacement_nodes: vec![Box::new(replacement)], + })]); + + let child3 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"c3"))), + Ok(PageResult::Drained), + ]); + + let mut drain = SequentialDrain::new(vec![ + Box::new(child1), + Box::new(split_child), + Box::new(child3), + ]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c1" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c2-split" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c3" + ); + assert_drained(drain.next_page(&mut context).await); + } + + #[tokio::test] + async fn handles_split_of_last_child() { + let child1 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"c1"))), + Ok(PageResult::Drained), + ]); + + let replacement = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"last-split"))), + Ok(PageResult::Drained), + ]); + let split_child = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired { + replacement_nodes: vec![Box::new(replacement)], + })]); + + let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(split_child)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"c1" + ); + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + b"last-split" + ); + assert_drained(drain.next_page(&mut context).await); + } + + #[tokio::test] + async fn handles_cascading_split() { + let final_leaf = MockLeaf::with_pages(vec![ + Ok(PageResult::Page(response(b"final"))), + Ok(PageResult::Drained), + ]); + + let cascading_replacement = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired { + replacement_nodes: vec![Box::new(final_leaf)], + })]); + + let initial_split = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired { + replacement_nodes: vec![Box::new(cascading_replacement)], + })]); + + let mut drain = SequentialDrain::new(vec![Box::new(initial_split)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + assert_eq!( + unwrap_page(drain.next_page(&mut context).await).body(), + 
b"final"
+        );
+        assert_drained(drain.next_page(&mut context).await);
+    }
+
+    #[tokio::test]
+    async fn split_retry_limit_prevents_infinite_loop() {
+        let mut current: Box<dyn PipelineNode> = Box::new(MockLeaf::with_pages(vec![Ok(
+            PageResult::Page(response(b"unreachable")),
+        )]));
+
+        for _ in 0..12 {
+            current = Box::new(MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired {
+                replacement_nodes: vec![current],
+            })]));
+        }
+
+        let mut drain = SequentialDrain::new(vec![current]);
+        let mut executor = NoopRequestExecutor;
+        let mut topology = NoopTopologyProvider;
+        let mut context = PipelineContext::new(&mut executor, &mut topology);
+
+        let err = drain.next_page(&mut context).await.unwrap_err();
+        assert!(err.to_string().contains("split retries"));
+    }
+
+    #[tokio::test]
+    async fn child_drained_immediately_skips_to_next() {
+        let empty_child = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]);
+        let real_child = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"data"))),
+            Ok(PageResult::Drained),
+        ]);
+
+        let mut drain = SequentialDrain::new(vec![Box::new(empty_child), Box::new(real_child)]);
+        let mut executor = NoopRequestExecutor;
+        let mut topology = NoopTopologyProvider;
+        let mut context = PipelineContext::new(&mut executor, &mut topology);
+
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"data"
+        );
+        assert_drained(drain.next_page(&mut context).await);
+    }
+
+    #[tokio::test]
+    async fn split_with_three_way_replacement() {
+        let r1 = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"r1"))),
+            Ok(PageResult::Drained),
+        ]);
+        let r2 = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"r2"))),
+            Ok(PageResult::Drained),
+        ]);
+        let r3 = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"r3"))),
+            Ok(PageResult::Drained),
+        ]);
+
+        let split_child = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired {
+            replacement_nodes: vec![Box::new(r1), Box::new(r2), Box::new(r3)],
+        })]);
+
+        let mut drain = SequentialDrain::new(vec![Box::new(split_child)]);
+        let mut executor = NoopRequestExecutor;
+        let mut topology = NoopTopologyProvider;
+        let mut context = PipelineContext::new(&mut executor, &mut topology);
+
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"r1"
+        );
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"r2"
+        );
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"r3"
+        );
+        assert_drained(drain.next_page(&mut context).await);
+    }
+
+    #[tokio::test]
+    async fn error_after_partial_drain() {
+        let child1 = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"ok"))),
+            Ok(PageResult::Drained),
+        ]);
+        let child2 = MockLeaf::with_pages(vec![Err(azure_core::Error::with_message(
+            azure_core::error::ErrorKind::Other,
+            "boom",
+        ))]);
+
+        let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(child2)]);
+        let mut executor = NoopRequestExecutor;
+        let mut topology = NoopTopologyProvider;
+        let mut context = PipelineContext::new(&mut executor, &mut topology);
+
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"ok"
+        );
+        let err = drain.next_page(&mut context).await.unwrap_err();
+        assert!(err.to_string().contains("boom"));
+    }
+
+    #[tokio::test]
+    async fn multiple_pages_per_child_then_advance() {
+        let child1 = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"c1-p1"))),
+            Ok(PageResult::Page(response(b"c1-p2"))),
+            Ok(PageResult::Page(response(b"c1-p3"))),
+            Ok(PageResult::Drained),
+        ]);
+        let child2 = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"c2-p1"))),
+            Ok(PageResult::Drained),
+        ]);
+
+        let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(child2)]);
+        let mut executor = NoopRequestExecutor;
+        let mut topology = NoopTopologyProvider;
+        let mut context = PipelineContext::new(&mut executor, &mut topology);
+
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"c1-p1"
+        );
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"c1-p2"
+        );
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"c1-p3"
+        );
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"c2-p1"
+        );
+        assert_drained(drain.next_page(&mut context).await);
+    }
+
+    #[tokio::test]
+    async fn split_produces_page_on_same_call() {
+        let replacement = MockLeaf::with_pages(vec![
+            Ok(PageResult::Page(response(b"immediate"))),
+            Ok(PageResult::Drained),
+        ]);
+
+        let split_child = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired {
+            replacement_nodes: vec![Box::new(replacement)],
+        })]);
+
+        let mut drain = SequentialDrain::new(vec![Box::new(split_child)]);
+        let mut executor = NoopRequestExecutor;
+        let mut topology = NoopTopologyProvider;
+        let mut context = PipelineContext::new(&mut executor, &mut topology);
+
+        assert_eq!(
+            unwrap_page(drain.next_page(&mut context).await).body(),
+            b"immediate"
+        );
+        assert_drained(drain.next_page(&mut context).await);
+    }
+
+    #[tokio::test]
+    async fn children_returns_all_nodes() {
+        let c1 = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]);
+        let c2 = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]);
+        let c3 = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]);
+
+        let drain = SequentialDrain::new(vec![Box::new(c1), Box::new(c2), Box::new(c3)]);
+        assert_eq!(drain.children().len(), 3);
+    }
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
new file mode 100644
index 00000000000..7b35c3688c0
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
@@ -0,0 +1,249 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! Shared test mocks for dataflow pipeline testing.
+
+use std::{collections::VecDeque, sync::Arc};
+
+use azure_core::http::StatusCode;
+use futures::future::BoxFuture;
+
+use super::{
+    ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode,
+    RequestExecutor, RequestTarget, ResolvedRange, TopologyProvider,
+};
+use crate::{
+    diagnostics::DiagnosticsContextBuilder,
+    models::{
+        effective_partition_key::EffectivePartitionKey, AccountReference, ActivityId,
+        CosmosOperation, CosmosResponse, CosmosResponseHeaders, CosmosStatus, DatabaseReference,
+        FeedRange, PartitionKey, SubStatusCode,
+    },
+    options::DiagnosticsOptions,
+};
+
+// ── Mock pipeline node ──────────────────────────────────────────────────────
+
+/// A mock leaf node that returns pre-configured page results.
+pub(crate) struct MockLeaf {
+    pages: VecDeque<azure_core::Result<PageResult>>,
+}
+
+impl MockLeaf {
+    /// Creates a mock leaf with a sequence of results to return from `next_page`.
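+    /// `next_page` panics once the configured results are exhausted.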
+ pub fn with_pages(pages: Vec>) -> Self { + Self { + pages: pages.into(), + } + } +} + +#[async_trait::async_trait] +impl PipelineNode for MockLeaf { + async fn next_page( + &mut self, + _context: &mut PipelineContext<'_>, + ) -> azure_core::Result { + self.pages + .pop_front() + .expect("MockLeaf: no more page results") + } + + fn children(&self) -> ChildNodes<'_> { + ChildNodes::None + } +} + +// ── Request executors ─────────────────────────────────────────────────────── + +/// A request executor that should never be called. +pub(crate) struct NoopRequestExecutor; + +impl RequestExecutor for NoopRequestExecutor { + fn execute_request<'a>( + &'a mut self, + _operation: &'a CosmosOperation, + _target: RequestTarget, + _partition_routing_refresh: PartitionRoutingRefresh, + _continuation: Option, + ) -> BoxFuture<'a, azure_core::Result> { + Box::pin(async { + Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "noop executor should not be called", + )) + }) + } +} + +/// A mock request executor that records calls and returns pre-configured responses. +pub(crate) struct MockRequestExecutor { + pub responses: VecDeque>, + pub refresh_calls: Vec, + pub continuation_calls: Vec>, +} + +impl MockRequestExecutor { + pub fn new(responses: Vec>) -> Self { + Self { + responses: responses.into(), + refresh_calls: Vec::new(), + continuation_calls: Vec::new(), + } + } +} + +impl RequestExecutor for MockRequestExecutor { + fn execute_request<'a>( + &'a mut self, + _operation: &'a CosmosOperation, + _target: RequestTarget, + partition_routing_refresh: PartitionRoutingRefresh, + continuation: Option, + ) -> BoxFuture<'a, azure_core::Result> { + self.refresh_calls.push(partition_routing_refresh); + self.continuation_calls.push(continuation); + let response = self.responses.pop_front().expect("mock request response"); + Box::pin(async move { response }) + } +} + +// ── Topology providers ───────────────────────────────────────────────────── + +/// A topology provider that should never be called. +pub(crate) struct NoopTopologyProvider; + +impl TopologyProvider for NoopTopologyProvider { + fn resolve_ranges<'a>( + &'a mut self, + _range: &'a FeedRange, + ) -> BoxFuture<'a, azure_core::Result>> { + Box::pin(async { + Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "noop topology provider should not be called", + )) + }) + } +} + +/// A mock topology provider that returns pre-configured resolved ranges. +pub(crate) struct MockTopologyProvider { + results: VecDeque>>, +} + +impl MockTopologyProvider { + pub fn new(results: Vec>>) -> Self { + Self { + results: results.into(), + } + } +} + +impl TopologyProvider for MockTopologyProvider { + fn resolve_ranges<'a>( + &'a mut self, + _range: &'a FeedRange, + ) -> BoxFuture<'a, azure_core::Result>> { + let result = self + .results + .pop_front() + .expect("MockTopologyProvider: no more results"); + Box::pin(async move { result }) + } +} + +// ── Test helpers ──────────────────────────────────────────────────────────── + +/// Extracts the `CosmosResponse` from a `PageResult::Page`, panicking otherwise. +pub(crate) fn unwrap_page(result: azure_core::Result) -> CosmosResponse { + match result.expect("expected Ok result") { + PageResult::Page(r) => r, + PageResult::Drained => panic!("expected Page, got Drained"), + PageResult::SplitRequired { .. } => panic!("expected Page, got SplitRequired"), + } +} + +/// Asserts that a `PageResult` is `Drained`. 
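+/// Panics, naming the unexpected variant, if the result is an error, a page, or a split request.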
+pub(crate) fn assert_drained(result: azure_core::Result) { + match result.expect("expected Ok result") { + PageResult::Drained => {} + PageResult::Page(_) => panic!("expected Drained, got Page"), + PageResult::SplitRequired { .. } => panic!("expected Drained, got SplitRequired"), + } +} + +/// Creates a test `CosmosOperation`. +pub(crate) fn operation() -> CosmosOperation { + let account = AccountReference::with_master_key( + url::Url::parse("https://test.documents.azure.com:443/").unwrap(), + "dGVzdA==", + ); + let database = DatabaseReference::from_name(account, "db".to_owned()); + CosmosOperation::read_database(database) +} + +/// Creates a `RequestTarget` for a logical partition key. +pub(crate) fn logical_partition_target() -> RequestTarget { + RequestTarget::LogicalPartitionKey(PartitionKey::from("pk")) +} + +/// Creates a `RequestTarget` for an EPK range ("" to "80", partition key range ID "0"). +pub(crate) fn epk_range_target() -> RequestTarget { + RequestTarget::EffectivePartitionKeyRange { + range: FeedRange::new( + EffectivePartitionKey::min(), + EffectivePartitionKey::from("80"), + ), + partition_key_range_id: "0".to_string(), + } +} + +/// Creates a test response with the given body. +pub(crate) fn response(body: &[u8]) -> CosmosResponse { + response_with_continuation(body, None) +} + +/// Creates a test response with the given body and optional continuation token. +pub(crate) fn response_with_continuation( + body: &[u8], + continuation: Option<&str>, +) -> CosmosResponse { + let mut diagnostics = DiagnosticsContextBuilder::new( + ActivityId::new_uuid(), + Arc::new(DiagnosticsOptions::default()), + ); + diagnostics.set_operation_status(StatusCode::Ok, None); + let mut headers = CosmosResponseHeaders::new(); + headers.continuation = continuation.map(str::to_owned); + CosmosResponse::new( + body.to_vec(), + headers, + CosmosStatus::new(StatusCode::Ok), + Arc::new(diagnostics.complete()), + ) +} + +/// Creates a 410 Gone error with a partition topology change substatus. +pub(crate) fn gone_error() -> azure_core::Error { + azure_core::Error::new( + azure_core::error::ErrorKind::HttpResponse { + status: StatusCode::Gone, + error_code: Some(SubStatusCode::PARTITION_KEY_RANGE_GONE.value().to_string()), + raw_response: None, + }, + "partition topology changed", + ) +} + +/// Creates a 410 Gone error with a non-topology substatus. +pub(crate) fn non_topology_gone_error() -> azure_core::Error { + azure_core::Error::new( + azure_core::error::ErrorKind::HttpResponse { + status: StatusCode::Gone, + error_code: Some(SubStatusCode::NAME_CACHE_STALE.value().to_string()), + raw_response: None, + }, + "name cache is stale", + ) +} diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs index 820e5c5264d..0ad72efcc34 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs @@ -3,14 +3,20 @@ //! Dataflow pipeline nodes for paged Cosmos DB operations. +mod drain; +#[cfg(test)] +pub(crate) mod mocks; pub(crate) mod planner; mod request; +mod topology; use futures::future::BoxFuture; -use crate::models::{CosmosOperation, CosmosResponse}; +use crate::models::{CosmosOperation, CosmosResponse, FeedRange}; +pub(crate) use drain::SequentialDrain; pub(crate) use request::{Request, RequestTarget}; +pub(crate) use topology::CachedTopologyProvider; /// Request execution mode for partition routing metadata. 
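+/// Callers use `ForceRefresh` when recovering from a partition split and `UseCached` otherwise.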
#[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -33,15 +39,46 @@ pub(crate) trait RequestExecutor: Send { ) -> BoxFuture<'a, azure_core::Result>; } +/// Resolves EPK ranges to their current physical partition key ranges. +/// +/// Used by pipeline nodes to recover from partition topology changes (splits). +/// The `PartitionKeyRangeCache` implements this trait in production. +pub(crate) trait TopologyProvider: Send { + /// Resolves the physical partitions that currently cover the given EPK range. + /// + /// Returns partition key range IDs paired with their EPK sub-ranges, ordered + /// by EPK from smallest to largest. + fn resolve_ranges<'a>( + &'a mut self, + range: &'a FeedRange, + ) -> BoxFuture<'a, azure_core::Result>>; +} + +/// A physical partition's EPK sub-range, as resolved from the current topology. +#[derive(Debug, Clone)] +pub(crate) struct ResolvedRange { + /// The partition key range ID for this physical partition. + pub partition_key_range_id: String, + /// The EPK sub-range within this physical partition. + pub range: FeedRange, +} + /// Context passed through dataflow node execution. pub(crate) struct PipelineContext<'a> { request_executor: &'a mut dyn RequestExecutor, + topology_provider: &'a mut dyn TopologyProvider, } impl<'a> PipelineContext<'a> { /// Creates a new pipeline execution context. - pub(crate) fn new(request_executor: &'a mut dyn RequestExecutor) -> Self { - Self { request_executor } + pub(crate) fn new( + request_executor: &'a mut dyn RequestExecutor, + topology_provider: &'a mut dyn TopologyProvider, + ) -> Self { + Self { + request_executor, + topology_provider, + } } async fn execute_request( @@ -55,6 +92,92 @@ impl<'a> PipelineContext<'a> { .execute_request(operation, target, partition_routing_refresh, continuation) .await } + + async fn resolve_ranges( + &mut self, + range: &FeedRange, + ) -> azure_core::Result> { + self.topology_provider.resolve_ranges(range).await + } +} + +/// Result of a single `next_page` call on a pipeline node. +/// +/// The `Page` variant contains a large `CosmosResponse` inline, but boxing it +/// would add a heap allocation on every page fetch — the hot path. The `SplitRequired` +/// variant is rare (only on partition splits), so the size difference is acceptable. +#[must_use = "a PageResult carries the next page, drain signal, or a split request that the caller must act on"] +#[allow(clippy::large_enum_variant)] +pub(crate) enum PageResult { + /// A page of results was produced. + Page(CosmosResponse), + /// This node has no more pages to emit. + Drained, + /// This node's EPK range has split and needs to be replaced by new child nodes. + /// + /// It is the parent intermediate node's responsibility to splice + /// `replacement_nodes` into its children list (in place of the child that + /// emitted this result) and re-attempt draining from the first replacement. + /// If a node returns `SplitRequired` to a parent that does not handle + /// splits (e.g. the pipeline root), the operation fails. + SplitRequired { + /// New child nodes covering the sub-ranges of the split partition. + replacement_nodes: Vec>, + }, +} + +impl std::fmt::Debug for PageResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PageResult::Page(_) => f.write_str("Page(...)"), + PageResult::Drained => f.write_str("Drained"), + PageResult::SplitRequired { + replacement_nodes, .. + } => write!(f, "SplitRequired({} nodes)", replacement_nodes.len()), + } + } +} + +/// An iterator over child pipeline nodes. 
+/// +/// Used by [`PipelineNode::children`] to expose children for diagnostics +/// without requiring a contiguous slice, which `VecDeque`-backed nodes +/// cannot always provide. +pub(crate) enum ChildNodes<'a> { + /// No children (leaf nodes). + None, + /// Children stored in a contiguous slice (e.g. a `Vec`). + Slice(&'a [Box]), + /// Children stored in a `VecDeque`, exposed as two contiguous slices. + Split(&'a [Box], &'a [Box]), +} + +impl<'a> ChildNodes<'a> { + /// Returns the total number of children. + pub fn len(&self) -> usize { + match self { + ChildNodes::None => 0, + ChildNodes::Slice(s) => s.len(), + ChildNodes::Split(a, b) => a.len() + b.len(), + } + } +} + +impl<'a> IntoIterator for ChildNodes<'a> { + type Item = &'a Box; + type IntoIter = std::iter::Chain< + std::slice::Iter<'a, Box>, + std::slice::Iter<'a, Box>, + >; + + fn into_iter(self) -> Self::IntoIter { + let empty: &[Box] = &[]; + match self { + ChildNodes::None => empty.iter().chain(empty.iter()), + ChildNodes::Slice(s) => s.iter().chain(empty.iter()), + ChildNodes::Split(a, b) => a.iter().chain(b.iter()), + } + } } /// A dataflow node that emits pages and may own child nodes. @@ -64,14 +187,14 @@ impl<'a> PipelineContext<'a> { /// of a Cosmos DB request. #[async_trait::async_trait] pub(crate) trait PipelineNode: Send + std::any::Any { - /// Emits the next page of results, or `None` when this node is drained. + /// Emits the next page of results, signals drain completion, or requests a split. async fn next_page( &mut self, context: &mut PipelineContext<'_>, - ) -> azure_core::Result>; + ) -> azure_core::Result; - /// Returns the node's strongly-owned children. - fn children(&self) -> &[Box]; + /// Returns the node's children for diagnostic inspection. + fn children(&self) -> ChildNodes<'_>; } impl dyn PipelineNode { @@ -98,93 +221,40 @@ impl Pipeline { } /// Emits the next page from the root node. + /// + /// Returns `Ok(Some(response))` for a page, `Ok(None)` when drained. pub(crate) async fn next_page( &mut self, context: &mut PipelineContext<'_>, ) -> azure_core::Result> { - self.root.next_page(context).await + match self.root.next_page(context).await? { + PageResult::Page(response) => Ok(Some(response)), + PageResult::Drained => Ok(None), + // Defensive: today the root is always a `Request`, `SequentialDrain`, + // or `DrainedLeaf`, none of which can bubble `SplitRequired` up past + // their parent. If a future node type ever does, surfacing it as an + // explicit error is preferable to silently dropping the page. + PageResult::SplitRequired { .. 
} => Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "root node cannot request a split; splits must be handled by a parent node", + )), + } } } #[cfg(test)] mod tests { - use std::{collections::VecDeque, sync::Arc}; - - use futures::future::BoxFuture; - + use super::mocks::*; use super::*; - use crate::{ - diagnostics::DiagnosticsContextBuilder, - models::{ActivityId, CosmosResponseHeaders, CosmosStatus}, - options::DiagnosticsOptions, - }; - - struct MockLeaf { - pages: VecDeque>>, - } - - impl MockLeaf { - fn with_pages(pages: Vec>>) -> Self { - Self { - pages: pages.into(), - } - } - } - - #[async_trait::async_trait] - impl PipelineNode for MockLeaf { - async fn next_page( - &mut self, - _context: &mut PipelineContext<'_>, - ) -> azure_core::Result> { - self.pages.pop_front().expect("mock page result") - } - - fn children(&self) -> &[Box] { - &[] - } - } - - struct NoopRequestExecutor; - - impl RequestExecutor for NoopRequestExecutor { - fn execute_request<'a>( - &'a mut self, - _operation: &'a CosmosOperation, - _target: RequestTarget, - _partition_routing_refresh: PartitionRoutingRefresh, - _continuation: Option, - ) -> BoxFuture<'a, azure_core::Result> { - Box::pin(async { - Err(azure_core::Error::with_message( - azure_core::error::ErrorKind::Other, - "noop executor should not be called", - )) - }) - } - } - - fn response(body: &[u8]) -> CosmosResponse { - let mut diagnostics = DiagnosticsContextBuilder::new( - ActivityId::new_uuid(), - Arc::new(DiagnosticsOptions::default()), - ); - diagnostics.set_operation_status(azure_core::http::StatusCode::Ok, None); - CosmosResponse::new( - body.to_vec(), - CosmosResponseHeaders::new(), - CosmosStatus::new(azure_core::http::StatusCode::Ok), - Arc::new(diagnostics.complete()), - ) - } #[tokio::test] async fn pipeline_forwards_pages_from_root() { - let mut pipeline = Pipeline::new(Box::new(MockLeaf::with_pages(vec![Ok(Some(response( - b"page", - )))]))); + let mut pipeline = Pipeline::new(Box::new(MockLeaf::with_pages(vec![Ok( + PageResult::Page(response(b"page")), + )]))); let mut executor = NoopRequestExecutor; - let mut context = PipelineContext::new(&mut executor); + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); let page = pipeline.next_page(&mut context).await.unwrap().unwrap(); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs index 3618e8547b8..6f27316ef02 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs @@ -8,7 +8,7 @@ use azure_core::http::StatusCode; use crate::models::{CosmosOperation, CosmosResponse, FeedRange, PartitionKey, SubStatusCode}; -use super::{PartitionRoutingRefresh, PipelineContext, PipelineNode}; +use super::{ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode}; /// The target of a request node. #[derive(Debug, Clone, PartialEq, Eq)] @@ -28,6 +28,18 @@ pub(crate) enum RequestTarget { }, } +impl RequestTarget { + /// Returns `true` if this target's EPK range starts at the same point as `parent_range`. + fn covers_start_of(&self, parent_range: &FeedRange) -> bool { + match self { + RequestTarget::EffectivePartitionKeyRange { range, .. } => { + range.min_inclusive() == parent_range.min_inclusive() + } + _ => false, + } + } +} + /// Leaf node that executes one Cosmos DB request per page. 
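+/// Tracks the most recent server-provided continuation token so pagination
+/// can resume across calls and across split replacement.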
pub(crate) struct Request { operation: CosmosOperation, @@ -77,7 +89,7 @@ impl PipelineNode for Request { async fn next_page( &mut self, context: &mut PipelineContext<'_>, - ) -> azure_core::Result> { + ) -> azure_core::Result { match context .execute_request( &self.operation, @@ -87,7 +99,9 @@ impl PipelineNode for Request { ) .await { - Ok(response) => Ok(Some(self.record_response_continuation(response))), + Ok(response) => Ok(PageResult::Page( + self.record_response_continuation(response), + )), Err(error) if is_partition_topology_change(&error) => { self.handle_partition_topology_change(context, error).await } @@ -95,8 +109,8 @@ impl PipelineNode for Request { } } - fn children(&self) -> &[Box] { - &[] + fn children(&self) -> ChildNodes<'_> { + ChildNodes::None } } impl Request { @@ -104,7 +118,7 @@ impl Request { &mut self, context: &mut PipelineContext<'_>, error: azure_core::Error, - ) -> azure_core::Result> { + ) -> azure_core::Result { match &self.target { RequestTarget::NonPartitioned => { // Non-partitioned resources don't have partition topology changes. @@ -115,6 +129,11 @@ impl Request { return Err(error); } + // This shouldn't really happen, but it's been observed. + // Since the original request had a logical partition key, + // the gateway should have been able to route the request + // to the correct partition even if it has split. + // But we can do a single retry without forcing a topology refresh to see if it succeeds. self.logical_partition_topology_retry_used = true; context .execute_request( @@ -124,23 +143,61 @@ impl Request { self.latest_server_continuation.clone(), ) .await - .map(|response| self.record_response_continuation(response)) - .map(Some) + .map(|response| PageResult::Page(self.record_response_continuation(response))) } - RequestTarget::EffectivePartitionKeyRange { .. } => { - panic!( - "EPK range request encountered a partition topology change; pipeline repair is not implemented" - ); + RequestTarget::EffectivePartitionKeyRange { range, .. } => { + let range = range.clone(); + self.split_for_topology_change(context, &range).await } } } + /// Resolves the current topology for this node's EPK range and returns + /// a `SplitRequired` result with replacement nodes for each sub-range. + async fn split_for_topology_change( + &self, + context: &mut PipelineContext<'_>, + range: &FeedRange, + ) -> azure_core::Result { + let resolved = context.resolve_ranges(range).await?; + + let replacement_nodes: Vec> = resolved + .into_iter() + .map(|resolved_range| { + let target = RequestTarget::EffectivePartitionKeyRange { + range: resolved_range.range, + partition_key_range_id: resolved_range.partition_key_range_id, + }; + // Carry over the server continuation to the first replacement that + // covers the same starting EPK. For a split, only the left-most child + // inherits the continuation since it resumes where this node left off. + // TODO: When we support streaming ordered merges, we'll need to augment this a bit. 
+ let continuation = if target.covers_start_of(range) { + self.latest_server_continuation.clone() + } else { + None + }; + Box::new(Request::with_continuation( + self.operation.clone(), + target, + continuation, + )) as Box + }) + .collect(); + + Ok(PageResult::SplitRequired { replacement_nodes }) + } + fn record_response_continuation(&mut self, response: CosmosResponse) -> CosmosResponse { self.latest_server_continuation = response.headers().continuation.clone(); response } } +// Partition topology changes are a specific subset of `Gone` substatus codes. +// Other substatus mappings live in `pipeline::retry_evaluation`; this one stays +// here because it drives pipeline-level repair (splitting a node into +// replacements) rather than per-attempt retry. fn is_partition_topology_change(error: &azure_core::Error) -> bool { match error.kind() { azure_core::error::ErrorKind::HttpResponse { @@ -164,125 +221,18 @@ fn is_partition_topology_change_substatus(substatus: u32) -> bool { #[cfg(test)] mod tests { - use std::{collections::VecDeque, sync::Arc}; - - use azure_core::error::ErrorKind; - use futures::future::BoxFuture; - use super::*; - use crate::{ - diagnostics::DiagnosticsContextBuilder, - driver::dataflow::RequestExecutor, - models::{ - effective_partition_key::EffectivePartitionKey, AccountReference, ActivityId, - CosmosResponseHeaders, CosmosStatus, DatabaseReference, - }, - options::DiagnosticsOptions, - }; - - struct MockRequestExecutor { - responses: VecDeque>, - refresh_calls: Vec, - continuation_calls: Vec>, - } - - impl MockRequestExecutor { - fn new(responses: Vec>) -> Self { - Self { - responses: responses.into(), - refresh_calls: Vec::new(), - continuation_calls: Vec::new(), - } - } - } - - impl RequestExecutor for MockRequestExecutor { - fn execute_request<'a>( - &'a mut self, - _operation: &'a CosmosOperation, - _target: RequestTarget, - partition_routing_refresh: PartitionRoutingRefresh, - continuation: Option, - ) -> BoxFuture<'a, azure_core::Result> { - self.refresh_calls.push(partition_routing_refresh); - self.continuation_calls.push(continuation); - let response = self.responses.pop_front().expect("mock request response"); - Box::pin(async move { response }) - } - } - - fn operation() -> CosmosOperation { - let account = AccountReference::with_master_key( - url::Url::parse("https://test.documents.azure.com:443/").unwrap(), - "dGVzdA==", - ); - let database = DatabaseReference::from_name(account, "db".to_owned()); - CosmosOperation::read_database(database) - } - - fn logical_partition_target() -> RequestTarget { - RequestTarget::LogicalPartitionKey(PartitionKey::from("pk")) - } - - fn epk_range_target() -> RequestTarget { - RequestTarget::EffectivePartitionKeyRange { - range: FeedRange::new( - EffectivePartitionKey::min(), - EffectivePartitionKey::from("80"), - ), - partition_key_range_id: "0".to_string(), - } - } - - fn response(body: &[u8]) -> CosmosResponse { - response_with_continuation(body, None) - } - - fn response_with_continuation(body: &[u8], continuation: Option<&str>) -> CosmosResponse { - let mut diagnostics = DiagnosticsContextBuilder::new( - ActivityId::new_uuid(), - Arc::new(DiagnosticsOptions::default()), - ); - diagnostics.set_operation_status(StatusCode::Ok, None); - let mut headers = CosmosResponseHeaders::new(); - headers.continuation = continuation.map(str::to_owned); - CosmosResponse::new( - body.to_vec(), - headers, - CosmosStatus::new(StatusCode::Ok), - Arc::new(diagnostics.complete()), - ) - } - - fn gone_error() -> azure_core::Error { - 
azure_core::Error::new( - ErrorKind::HttpResponse { - status: StatusCode::Gone, - error_code: Some(SubStatusCode::PARTITION_KEY_RANGE_GONE.value().to_string()), - raw_response: None, - }, - "partition topology changed", - ) - } - - fn non_topology_gone_error() -> azure_core::Error { - azure_core::Error::new( - ErrorKind::HttpResponse { - status: StatusCode::Gone, - error_code: Some(SubStatusCode::NAME_CACHE_STALE.value().to_string()), - raw_response: None, - }, - "name cache is stale", - ) - } + use crate::driver::dataflow::{mocks::*, ResolvedRange}; + use crate::models::{effective_partition_key::EffectivePartitionKey, FeedRange}; #[tokio::test] async fn request_retries_logical_partition_key_topology_change_once() { let mut request = Request::new(operation(), logical_partition_target()); let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Ok(response(b"ok"))]); - let mut context = PipelineContext::new(&mut executor); + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); - let page = request.next_page(&mut context).await.unwrap().unwrap(); + let page = unwrap_page(request.next_page(&mut context).await); assert_eq!(page.body(), b"ok"); assert_eq!( @@ -299,7 +249,8 @@ mod tests { async fn request_returns_second_logical_partition_key_topology_change() { let mut request = Request::new(operation(), logical_partition_target()); let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Err(gone_error())]); - let mut context = PipelineContext::new(&mut executor); + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); let error = request.next_page(&mut context).await.unwrap_err(); @@ -318,7 +269,8 @@ mod tests { async fn request_does_not_retry_non_topology_gone() { let mut request = Request::new(operation(), logical_partition_target()); let mut executor = MockRequestExecutor::new(vec![Err(non_topology_gone_error())]); - let mut context = PipelineContext::new(&mut executor); + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); let error = request.next_page(&mut context).await.unwrap_err(); @@ -337,10 +289,11 @@ mod tests { Ok(response_with_continuation(b"page1", Some("token-1"))), Ok(response_with_continuation(b"page2", Some("token-2"))), ]); - let mut context = PipelineContext::new(&mut executor); + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); - let page1 = request.next_page(&mut context).await.unwrap().unwrap(); - let page2 = request.next_page(&mut context).await.unwrap().unwrap(); + let page1 = unwrap_page(request.next_page(&mut context).await); + let page2 = unwrap_page(request.next_page(&mut context).await); assert_eq!(page1.body(), b"page1"); assert_eq!(page2.body(), b"page2"); @@ -359,9 +312,10 @@ mod tests { Some("restored-token".to_string()), ); let mut executor = MockRequestExecutor::new(vec![Ok(response(b"page"))]); - let mut context = PipelineContext::new(&mut executor); + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); - let page = request.next_page(&mut context).await.unwrap().unwrap(); + let page = unwrap_page(request.next_page(&mut context).await); assert_eq!(page.body(), b"page"); assert_eq!( @@ -370,4 +324,188 @@ mod tests { ); assert_eq!(request.latest_server_continuation(), None); } + + // ── Split recovery tests 
────────────────────────────────────────────── + + #[tokio::test] + async fn epk_range_topology_change_returns_split_required() { + let mut request = Request::new(operation(), epk_range_target()); + let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![ + ResolvedRange { + partition_key_range_id: "1".to_string(), + range: FeedRange::new( + EffectivePartitionKey::min(), + EffectivePartitionKey::from("40"), + ), + }, + ResolvedRange { + partition_key_range_id: "2".to_string(), + range: FeedRange::new( + EffectivePartitionKey::from("40"), + EffectivePartitionKey::from("80"), + ), + }, + ])]); + let mut context = PipelineContext::new(&mut executor, &mut topology); + + let result = request.next_page(&mut context).await.unwrap(); + match result { + PageResult::SplitRequired { replacement_nodes } => { + assert_eq!(replacement_nodes.len(), 2); + + let r0 = replacement_nodes[0].downcast_ref::().unwrap(); + assert_eq!( + r0.target(), + &RequestTarget::EffectivePartitionKeyRange { + range: FeedRange::new( + EffectivePartitionKey::min(), + EffectivePartitionKey::from("40"), + ), + partition_key_range_id: "1".to_string(), + } + ); + + let r1 = replacement_nodes[1].downcast_ref::().unwrap(); + assert_eq!( + r1.target(), + &RequestTarget::EffectivePartitionKeyRange { + range: FeedRange::new( + EffectivePartitionKey::from("40"), + EffectivePartitionKey::from("80"), + ), + partition_key_range_id: "2".to_string(), + } + ); + } + other => panic!("expected SplitRequired, got {:?}", other), + } + } + + #[tokio::test] + async fn split_left_child_inherits_continuation() { + let mut request = Request::with_continuation( + operation(), + epk_range_target(), + Some("server-token".to_string()), + ); + let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![ + ResolvedRange { + partition_key_range_id: "1".to_string(), + range: FeedRange::new( + EffectivePartitionKey::min(), + EffectivePartitionKey::from("40"), + ), + }, + ResolvedRange { + partition_key_range_id: "2".to_string(), + range: FeedRange::new( + EffectivePartitionKey::from("40"), + EffectivePartitionKey::from("80"), + ), + }, + ])]); + let mut context = PipelineContext::new(&mut executor, &mut topology); + + let result = request.next_page(&mut context).await.unwrap(); + match result { + PageResult::SplitRequired { replacement_nodes } => { + let left = replacement_nodes[0].downcast_ref::().unwrap(); + assert_eq!( + left.latest_server_continuation(), + Some("server-token"), + "left-most child should inherit the server continuation" + ); + + let right = replacement_nodes[1].downcast_ref::().unwrap(); + assert_eq!( + right.latest_server_continuation(), + None, + "non-left children should have no continuation" + ); + } + other => panic!("expected SplitRequired, got {:?}", other), + } + } + + #[tokio::test] + async fn split_three_way_only_left_inherits_continuation() { + let range = FeedRange::new( + EffectivePartitionKey::from("10"), + EffectivePartitionKey::from("90"), + ); + let mut request = Request::with_continuation( + operation(), + RequestTarget::EffectivePartitionKeyRange { + range: range.clone(), + partition_key_range_id: "0".to_string(), + }, + Some("ct".to_string()), + ); + let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![ + ResolvedRange { + partition_key_range_id: "1".to_string(), + range: FeedRange::new( + 
EffectivePartitionKey::from("10"), + EffectivePartitionKey::from("40"), + ), + }, + ResolvedRange { + partition_key_range_id: "2".to_string(), + range: FeedRange::new( + EffectivePartitionKey::from("40"), + EffectivePartitionKey::from("70"), + ), + }, + ResolvedRange { + partition_key_range_id: "3".to_string(), + range: FeedRange::new( + EffectivePartitionKey::from("70"), + EffectivePartitionKey::from("90"), + ), + }, + ])]); + let mut context = PipelineContext::new(&mut executor, &mut topology); + + let result = request.next_page(&mut context).await.unwrap(); + match result { + PageResult::SplitRequired { replacement_nodes } => { + assert_eq!(replacement_nodes.len(), 3); + let left = replacement_nodes[0].downcast_ref::().unwrap(); + assert_eq!(left.latest_server_continuation(), Some("ct")); + let mid = replacement_nodes[1].downcast_ref::().unwrap(); + assert_eq!(mid.latest_server_continuation(), None); + let right = replacement_nodes[2].downcast_ref::().unwrap(); + assert_eq!(right.latest_server_continuation(), None); + } + other => panic!("expected SplitRequired, got {:?}", other), + } + } + + #[tokio::test] + async fn topology_provider_error_propagates() { + let mut request = Request::new(operation(), epk_range_target()); + let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); + let mut topology = MockTopologyProvider::new(vec![Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "topology fetch failed", + ))]); + let mut context = PipelineContext::new(&mut executor, &mut topology); + + let err = request.next_page(&mut context).await.unwrap_err(); + assert!(err.to_string().contains("topology fetch failed")); + } + + #[tokio::test] + async fn non_partitioned_topology_change_not_retried() { + let mut request = Request::new(operation(), RequestTarget::NonPartitioned); + let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + let err = request.next_page(&mut context).await.unwrap_err(); + assert!(is_partition_topology_change(&err)); + } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs new file mode 100644 index 00000000000..1a47b40f8c1 --- /dev/null +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs @@ -0,0 +1,272 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +//! Topology provider adapter backed by the partition key range cache. + +use futures::future::BoxFuture; + +use crate::{ + driver::cache::{PartitionKeyRangeCache, PkRangeFetchResult}, + models::{ContainerReference, FeedRange}, +}; + +use super::{ResolvedRange, TopologyProvider}; + +/// Adapts [`PartitionKeyRangeCache`] to the [`TopologyProvider`] trait. +/// +/// Holds a reference to the cache, the container being queried, and a function +/// that fetches partition key ranges from the service. On each +/// [`resolve_ranges`](TopologyProvider::resolve_ranges) call, it force-refreshes +/// the cache (since splits are the reason we're resolving) and converts the +/// resulting `PartitionKeyRange` objects to [`ResolvedRange`] values. +/// +/// # Type parameters +/// +/// * `F` — `Fn(ContainerReference, Option) -> Fut` that fetches +/// pk-ranges from the service. Passed by reference to the cache so the +/// adapter can call it repeatedly without requiring `Clone`. 
+pub(crate) struct CachedTopologyProvider<'a, F> { + cache: &'a PartitionKeyRangeCache, + container: ContainerReference, + fetch_pk_ranges: F, +} + +impl<'a, F> CachedTopologyProvider<'a, F> { + /// Creates a topology provider backed by the partition key range cache. + pub(crate) fn new( + cache: &'a PartitionKeyRangeCache, + container: ContainerReference, + fetch_pk_ranges: F, + ) -> Self { + Self { + cache, + container, + fetch_pk_ranges, + } + } +} + +impl TopologyProvider for CachedTopologyProvider<'_, F> +where + F: Fn(ContainerReference, Option) -> Fut + Send + Sync, + Fut: std::future::Future> + Send, +{ + fn resolve_ranges<'a>( + &'a mut self, + range: &'a FeedRange, + ) -> BoxFuture<'a, azure_core::Result>> { + Box::pin(async move { + // Force-refresh because we're recovering from a topology change (split). + let pk_ranges = self + .cache + .resolve_overlapping_ranges( + &self.container, + range.min_inclusive()..range.max_exclusive(), + true, + &self.fetch_pk_ranges, + ) + .await; + + let pk_ranges = match pk_ranges { + Some(ranges) if !ranges.is_empty() => ranges, + _ => { + return Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "failed to resolve partition key ranges from topology cache", + )); + } + }; + + Ok(pk_ranges + .into_iter() + .map(|pkr| ResolvedRange { + partition_key_range_id: pkr.id, + range: FeedRange::new(pkr.min_inclusive, pkr.max_exclusive), + }) + .collect()) + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::models::{ + effective_partition_key::EffectivePartitionKey, + partition_key_range::PartitionKeyRange as PkRange, ContainerProperties, + }; + + fn make_container() -> ContainerReference { + let account = crate::models::AccountReference::with_master_key( + url::Url::parse("https://test.documents.azure.com:443/").unwrap(), + "dGVzdA==", + ); + let props = ContainerProperties { + id: "c".into(), + partition_key: serde_json::from_str(r#"{"paths":["/pk"],"version":2}"#).unwrap(), + system_properties: Default::default(), + }; + ContainerReference::new(account, "db", "db_rid", "c", "c_rid", &props) + } + + async fn single_range_fetch( + _container: ContainerReference, + continuation: Option, + ) -> Option { + if continuation.is_some() { + Some(PkRangeFetchResult { + ranges: vec![], + continuation, + not_modified: true, + }) + } else { + Some(PkRangeFetchResult { + ranges: vec![PkRange::new("0".into(), "", "FF")], + continuation: Some("etag-1".to_string()), + not_modified: false, + }) + } + } + + async fn two_range_fetch( + _container: ContainerReference, + continuation: Option, + ) -> Option { + if continuation.is_some() { + Some(PkRangeFetchResult { + ranges: vec![], + continuation, + not_modified: true, + }) + } else { + Some(PkRangeFetchResult { + ranges: vec![ + PkRange::new("1".into(), "", "80"), + PkRange::new("2".into(), "80", "FF"), + ], + continuation: Some("etag-2".to_string()), + not_modified: false, + }) + } + } + + async fn three_range_fetch( + _container: ContainerReference, + continuation: Option, + ) -> Option { + if continuation.is_some() { + Some(PkRangeFetchResult { + ranges: vec![], + continuation, + not_modified: true, + }) + } else { + Some(PkRangeFetchResult { + ranges: vec![ + PkRange::new("1".into(), "", "40"), + PkRange::new("2".into(), "40", "80"), + PkRange::new("3".into(), "80", "FF"), + ], + continuation: Some("etag-3".to_string()), + not_modified: false, + }) + } + } + + async fn failing_fetch( + _container: ContainerReference, + _continuation: Option, + ) -> Option { + None + } + + 
#[tokio::test] + async fn resolves_single_range_for_full_epk_space() { + let cache = PartitionKeyRangeCache::new(); + let mut provider = + CachedTopologyProvider::new(&cache, make_container(), single_range_fetch); + + let ranges = provider.resolve_ranges(&FeedRange::full()).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!(ranges[0].partition_key_range_id, "0"); + assert_eq!( + ranges[0].range.min_inclusive(), + &EffectivePartitionKey::min() + ); + assert_eq!( + ranges[0].range.max_exclusive(), + &EffectivePartitionKey::max() + ); + } + + #[tokio::test] + async fn resolves_split_ranges() { + let cache = PartitionKeyRangeCache::new(); + let mut provider = CachedTopologyProvider::new(&cache, make_container(), two_range_fetch); + + let ranges = provider.resolve_ranges(&FeedRange::full()).await.unwrap(); + + assert_eq!(ranges.len(), 2); + assert_eq!(ranges[0].partition_key_range_id, "1"); + assert_eq!( + ranges[0].range.min_inclusive(), + &EffectivePartitionKey::min() + ); + assert_eq!( + ranges[0].range.max_exclusive(), + &EffectivePartitionKey::from("80") + ); + assert_eq!(ranges[1].partition_key_range_id, "2"); + assert_eq!( + ranges[1].range.min_inclusive(), + &EffectivePartitionKey::from("80") + ); + assert_eq!( + ranges[1].range.max_exclusive(), + &EffectivePartitionKey::max() + ); + } + + #[tokio::test] + async fn resolves_partial_epk_range() { + let cache = PartitionKeyRangeCache::new(); + let mut provider = CachedTopologyProvider::new(&cache, make_container(), two_range_fetch); + + let left_half = FeedRange::new( + EffectivePartitionKey::min(), + EffectivePartitionKey::from("80"), + ); + let ranges = provider.resolve_ranges(&left_half).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!(ranges[0].partition_key_range_id, "1"); + } + + #[tokio::test] + async fn resolves_three_way_split() { + let cache = PartitionKeyRangeCache::new(); + let mut provider = CachedTopologyProvider::new(&cache, make_container(), three_range_fetch); + + let ranges = provider.resolve_ranges(&FeedRange::full()).await.unwrap(); + + assert_eq!(ranges.len(), 3); + assert_eq!(ranges[0].partition_key_range_id, "1"); + assert_eq!(ranges[1].partition_key_range_id, "2"); + assert_eq!(ranges[2].partition_key_range_id, "3"); + } + + #[tokio::test] + async fn returns_error_when_fetch_fails() { + let cache = PartitionKeyRangeCache::new(); + let mut provider = CachedTopologyProvider::new(&cache, make_container(), failing_fetch); + + let err = provider + .resolve_ranges(&FeedRange::full()) + .await + .unwrap_err(); + assert!(err + .to_string() + .contains("failed to resolve partition key ranges")); + } +} From ad0969b18ec3668a2ec10f234db8c979763a4f22 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Thu, 7 May 2026 17:41:47 -0700 Subject: [PATCH 18/29] Add query plan models and plan_operation API Add backend query plan model types (QueryPlan, QueryInfo, QueryRange, etc.) for parsing gateway query plan responses. Implement build_pipeline in planner.rs that validates query plans (rejects unsupported features like top/limit/ordering/ aggregates/hybrid search) and constructs fan-out pipelines from backend-provided query ranges. Add plan_operation on CosmosDriver that handles trivial plans (non-query or single-partition) with singleton pipelines, and cross-partition queries by fetching a query plan from the backend then building a fan-out pipeline via build_pipeline. 
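A rough sketch of the resulting (internal) call pattern; pipeline context
construction is omitted and `consume` is an illustrative callback, not
part of this patch:

    let plan = driver.plan_operation(operation, &options).await?;
    let mut pipeline = plan.pipeline;
    // Pages surface as raw bytes; `None` signals the pipeline is drained.
    while let Some(page) = pipeline.next_page(&mut context).await? {
        consume(page.body());
    }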
Supporting changes: - Add supported_query_features field to CosmosRequestHeaders - Add force_refresh parameter to CachedTopologyProvider - Add PartitionKeyRangeCache to CosmosDriver - Add fetch_partition_key_ranges and read_partition_key_ranges - Add OperationPlan public newtype wrapping Pipeline - Add Debug impl for Pipeline Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/clients/container_client.rs | 10 +- .../src/driver/cosmos_driver.rs | 109 +++- .../src/driver/dataflow/drain.rs | 13 +- .../src/driver/dataflow/mocks.rs | 6 + .../src/driver/dataflow/mod.rs | 66 ++- .../src/driver/dataflow/planner.rs | 545 +++++++++++++++++- .../src/driver/dataflow/query_plan.rs | 263 +++++++++ .../src/driver/dataflow/request.rs | 10 +- .../src/driver/dataflow/topology.rs | 42 +- .../src/driver/mod.rs | 1 + .../azure_data_cosmos_driver/src/lib.rs | 2 +- .../src/models/cosmos_headers.rs | 19 + .../src/models/cosmos_operation.rs | 126 +++- .../src/models/mod.rs | 8 + .../src/models/operation_target.rs | 7 + 15 files changed, 1146 insertions(+), 81 deletions(-) create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs index 2d6db39a537..68f05183993 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs @@ -23,7 +23,7 @@ use crate::handler::container_connection::ContainerConnection; use crate::routing::partition_key_range_cache::PartitionKeyRangeCache; use azure_data_cosmos_driver::models::{ effective_partition_key::EffectivePartitionKey as DriverEpk, ContainerReference, - CosmosOperation, ItemReference, PartitionKeyKind, + CosmosOperation, ItemReference, OperationTarget, PartitionKeyKind, }; use azure_data_cosmos_driver::options::OperationOptions; use serde::{de::DeserializeOwned, Serialize}; @@ -740,8 +740,12 @@ impl ContainerClient { let driver_pk = partition_key.into_driver_partition_key(); let container_ref = self.container_ref.clone(); - let factory = - move || CosmosOperation::query_items(container_ref.clone(), driver_pk.clone()); + let factory = move || { + CosmosOperation::query_items( + container_ref.clone(), + OperationTarget::PartitionKey(driver_pk.clone()), + ) + }; crate::query::executor::QueryExecutor::new( self.context.driver.clone(), diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs index a4589002a8a..d3bbc1d18c6 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs @@ -8,8 +8,10 @@ use crate::{ DiagnosticsContextBuilder, PipelineType, TransportHttpVersion, TransportSecurity, }, driver::{ + cache::{PartitionKeyRangeCache, PkRangeFetchResult}, dataflow::{ - planner, PartitionRoutingRefresh, PipelineContext, RequestExecutor, RequestTarget, + planner, query_plan::QueryPlan, CachedTopologyProvider, OperationPlan, + PartitionRoutingRefresh, PipelineContext, RequestExecutor, RequestTarget, ResolvedRange, TopologyProvider, }, pipeline::operation_pipeline::OperationOverrides, @@ -95,6 +97,7 @@ impl TopologyProvider for StubTopologyProvider { fn resolve_ranges<'a>( &'a mut self, _range: &'a crate::models::FeedRange, + _refresh: super::dataflow::PartitionRoutingRefresh, ) -> BoxFuture<'a, azure_core::Result>> { Box::pin(async { 
Err(azure_core::Error::with_message( @@ -134,6 +137,8 @@ pub struct CosmosDriver { /// initialization. In normal usage `get_or_create_driver` awaits `initialize()` /// before returning, so this guard only catches misuse. initialized: AtomicBool, + /// Cache for partition key ranges, used for topology resolution during planning. + pk_range_cache: PartitionKeyRangeCache, } impl CosmosDriver { @@ -833,6 +838,7 @@ impl CosmosDriver { location_state_store, session_manager: SessionManager::new(), initialized: AtomicBool::new(false), + pk_range_cache: PartitionKeyRangeCache::new(), } } @@ -1043,7 +1049,7 @@ impl CosmosDriver { } tracing::debug!("operation started"); - let mut pipeline = planner::plan_pipeline(&operation)?; + let mut pipeline = planner::build_trivial_pipeline(&operation)?; let mut executor = DriverRequestExecutor { driver: self, @@ -1263,6 +1269,105 @@ impl CosmosDriver { Ok(resolved.as_ref().clone()) } + + /// Plans the execution of a Cosmos DB operation. + /// + /// For trivial operations (non-query or single-partition), returns a + /// singleton pipeline immediately. For cross-partition queries, fetches a + /// query plan from the backend and builds a fan-out pipeline. + pub async fn plan_operation( + &self, + operation: CosmosOperation, + options: &OperationOptions, + ) -> azure_core::Result { + // Trivial plan: anything that isn't a cross-partition query. + if operation.is_trivial() { + let pipeline = planner::build_trivial_pipeline(&operation)?; + return Ok(OperationPlan::new(pipeline)); + } + + // Cross-partition query: fetch query plan from backend. + let container = operation.container().ok_or_else(|| { + azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "cross-partition query requires a container reference", + ) + })?; + + let query_plan_operation = CosmosOperation::query_plan(container.clone()) + .with_body(operation.body().unwrap_or_default().to_vec()); + + let response = self + .execute_operation_direct( + &query_plan_operation, + OperationOverrides::default(), + options, + ) + .await?; + + let query_plan: QueryPlan = serde_json::from_slice(response.body()).map_err(|e| { + azure_core::Error::with_message( + azure_core::error::ErrorKind::DataConversion, + format!("failed to parse query plan response: {e}"), + ) + })?; + + // Build the fan-out pipeline using the query plan. + let container_ref = container.clone(); + let mut topology = CachedTopologyProvider::new( + &self.pk_range_cache, + container_ref, + |container, continuation| self.fetch_partition_key_ranges(container, continuation), + ); + + let pipeline = planner::build_sequential_drain(&query_plan, &mut topology, &operation).await?; + Ok(OperationPlan::new(pipeline)) + } + + /// Fetches partition key ranges from the service for the given container. + /// + /// Used as the fetch function for [`CachedTopologyProvider`]. 
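+    /// Returns `None` if the request or response deserialization fails; the
+    /// cache layer treats `None` as a failed fetch.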
+ async fn fetch_partition_key_ranges( + &self, + container: ContainerReference, + continuation: Option, + ) -> Option { + let operation = CosmosOperation::read_partition_key_ranges(container); + let overrides = OperationOverrides { + continuation, + ..Default::default() + }; + let options = OperationOptions::default(); + + let response = self + .execute_operation_direct(&operation, overrides, &options) + .await + .ok()?; + + let not_modified = u16::from(response.status().status_code()) == 304; + let etag_continuation = response + .headers() + .etag + .as_ref() + .map(|e| e.as_str().to_owned()); + + if not_modified { + return Some(PkRangeFetchResult { + ranges: Vec::new(), + continuation: etag_continuation, + not_modified: true, + }); + } + + let pk_ranges_response: crate::models::partition_key_range::PkRangesResponse = + serde_json::from_slice(response.body()).ok()?; + + Some(PkRangeFetchResult { + ranges: pk_ranges_response.partition_key_ranges, + continuation: etag_continuation, + not_modified: false, + }) + } } #[cfg(test)] diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs index 7e419702528..9c652fe5001 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs @@ -92,6 +92,10 @@ impl PipelineNode for SequentialDrain { ChildNodes::Split(front, back) } } + + fn into_children(self) -> Vec> { + self.children.into_iter().collect() + } } #[cfg(test)] @@ -184,7 +188,7 @@ mod tests { let mut context = PipelineContext::new(&mut executor, &mut topology); let err = drain.next_page(&mut context).await.unwrap_err(); - assert!(err.to_string().contains("test error")); + assert_eq!(err.to_string(), "test error"); } #[tokio::test] @@ -347,7 +351,10 @@ mod tests { let mut context = PipelineContext::new(&mut executor, &mut topology); let err = drain.next_page(&mut context).await.unwrap_err(); - assert!(err.to_string().contains("split retries")); + assert_eq!( + err.to_string(), + "exceeded maximum split retries (10) in SequentialDrain" + ); } #[tokio::test] @@ -430,7 +437,7 @@ mod tests { b"ok" ); let err = drain.next_page(&mut context).await.unwrap_err(); - assert!(err.to_string().contains("boom")); + assert_eq!(err.to_string(), "boom"); } #[tokio::test] diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs index 7b35c3688c0..3d770f8c261 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs @@ -52,6 +52,10 @@ impl PipelineNode for MockLeaf { fn children(&self) -> ChildNodes<'_> { ChildNodes::None } + + fn into_children(self) -> Vec> { + vec![] + } } // ── Request executors ─────────────────────────────────────────────────────── @@ -117,6 +121,7 @@ impl TopologyProvider for NoopTopologyProvider { fn resolve_ranges<'a>( &'a mut self, _range: &'a FeedRange, + _refresh: PartitionRoutingRefresh, ) -> BoxFuture<'a, azure_core::Result>> { Box::pin(async { Err(azure_core::Error::with_message( @@ -144,6 +149,7 @@ impl TopologyProvider for MockTopologyProvider { fn resolve_ranges<'a>( &'a mut self, _range: &'a FeedRange, + _refresh: PartitionRoutingRefresh, ) -> BoxFuture<'a, azure_core::Result>> { let result = self .results diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs 
b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs index 0ad72efcc34..49a5a3d872b 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs @@ -7,9 +7,12 @@ mod drain; #[cfg(test)] pub(crate) mod mocks; pub(crate) mod planner; +pub(crate) mod query_plan; mod request; mod topology; +use std::ops::Index; + use futures::future::BoxFuture; use crate::models::{CosmosOperation, CosmosResponse, FeedRange}; @@ -41,16 +44,22 @@ pub(crate) trait RequestExecutor: Send { /// Resolves EPK ranges to their current physical partition key ranges. /// -/// Used by pipeline nodes to recover from partition topology changes (splits). +/// Used by pipeline nodes to recover from partition topology changes (splits) +/// and by the planner to resolve initial query ranges. /// The `PartitionKeyRangeCache` implements this trait in production. pub(crate) trait TopologyProvider: Send { /// Resolves the physical partitions that currently cover the given EPK range. /// + /// `refresh` controls whether the topology cache is refreshed before resolving: + /// callers use [`PartitionRoutingRefresh::ForceRefresh`] for split recovery + /// and [`PartitionRoutingRefresh::UseCached`] for planning. + /// /// Returns partition key range IDs paired with their EPK sub-ranges, ordered /// by EPK from smallest to largest. fn resolve_ranges<'a>( &'a mut self, range: &'a FeedRange, + refresh: PartitionRoutingRefresh, ) -> BoxFuture<'a, azure_core::Result>>; } @@ -96,8 +105,9 @@ impl<'a> PipelineContext<'a> { async fn resolve_ranges( &mut self, range: &FeedRange, + refresh: PartitionRoutingRefresh, ) -> azure_core::Result> { - self.topology_provider.resolve_ranges(range).await + self.topology_provider.resolve_ranges(range, refresh).await } } @@ -163,6 +173,24 @@ impl<'a> ChildNodes<'a> { } } +impl<'a> Index for ChildNodes<'a> { + type Output = Box; + + fn index(&self, index: usize) -> &Self::Output { + match self { + ChildNodes::None => panic!("index out of bounds"), + ChildNodes::Slice(s) => &s[index], + ChildNodes::Split(a, b) => { + if index < a.len() { + &a[index] + } else { + &b[index - a.len()] + } + } + } + } +} + impl<'a> IntoIterator for ChildNodes<'a> { type Item = &'a Box; type IntoIter = std::iter::Chain< @@ -195,6 +223,9 @@ pub(crate) trait PipelineNode: Send + std::any::Any { /// Returns the node's children for diagnostic inspection. fn children(&self) -> ChildNodes<'_>; + + /// Consumes this node and returns its children as a `Vec`. + fn into_children(self) -> Vec>; } impl dyn PipelineNode { @@ -202,6 +233,11 @@ impl dyn PipelineNode { pub(crate) fn downcast_ref(&self) -> Option<&T> { (self as &dyn std::any::Any).downcast_ref::() } + + /// Downcasts this node to a concrete type. + pub(crate) fn downcast(self: Box) -> Option> { + (self as Box).downcast::().ok() + } } /// A pipeline root that owns the node tree. @@ -209,6 +245,12 @@ pub(crate) struct Pipeline { root: Box, } +impl std::fmt::Debug for Pipeline { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Pipeline").finish_non_exhaustive() + } +} + impl Pipeline { /// Creates a pipeline from an owned root node. pub(crate) fn new(root: Box) -> Self { @@ -220,6 +262,11 @@ impl Pipeline { &*self.root } + /// Consumes the pipeline and returns the root node. + pub(crate) fn into_root(self) -> Box { + self.root + } + /// Emits the next page from the root node. /// /// Returns `Ok(Some(response))` for a page, `Ok(None)` when drained. 
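+    /// A `SplitRequired` escaping the root node surfaces as an error rather than a page.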
@@ -242,6 +289,21 @@ impl Pipeline { } } +/// An opaque plan for executing a Cosmos DB operation. +/// +/// Wraps the internal dataflow [`Pipeline`] to hide its structure from callers. +/// Produced by [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation). +pub struct OperationPlan { + pub(crate) pipeline: Pipeline, +} + +impl OperationPlan { + /// Creates an operation plan wrapping the given pipeline. + pub(crate) fn new(pipeline: Pipeline) -> Self { + Self { pipeline } + } +} + #[cfg(test)] mod tests { use super::mocks::*; diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs index 259357409ee..eeefd749bca 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs @@ -5,18 +5,40 @@ //! //! The planner validates an operation's target against its resource type and //! constructs the appropriate dataflow [`Pipeline`]. +//! +//! For cross-partition queries, [`build_sequential_drain`] consumes a backend +//! [`QueryPlan`](super::query_plan::QueryPlan) and resolves the query's EPK +//! ranges against the current topology to produce a fan-out pipeline. -use crate::models::{CosmosOperation, OperationTarget}; +use crate::models::{ + effective_partition_key::EffectivePartitionKey, CosmosOperation, FeedRange, OperationTarget, +}; -use super::{Pipeline, Request, RequestTarget}; +use super::{ + query_plan::{QueryInfo, QueryPlan}, + PartitionRoutingRefresh, Pipeline, PipelineNode, Request, RequestTarget, SequentialDrain, + TopologyProvider, +}; -/// Validates and builds a [`Pipeline`] for the given operation. +/// Builds a single-node [`Pipeline`] for a trivial operation. +/// +/// Trivial operations are those that can be satisfied by a single request to +/// one partition (point reads, single-partition queries, metadata operations). +/// Use [`CosmosOperation::is_trivial`] to check eligibility before calling. /// -/// This is the "Planning" phase of operation execution. It: -/// 1. Validates that the operation's target is compatible with its resource type. -/// 2. Maps the operation target to a pipeline node tree (currently a single -/// [`Request`] leaf node for point and single-partition operations). -pub(crate) fn plan_pipeline(operation: &CosmosOperation) -> azure_core::Result { +/// # Panics (debug builds) +/// +/// Debug-asserts that the operation is indeed trivial. In release builds, +/// returns an error if a non-trivial operation (e.g. a cross-partition query) +/// is passed. +pub(crate) fn build_trivial_pipeline(operation: &CosmosOperation) -> azure_core::Result { + debug_assert!( + operation.is_trivial(), + "build_trivial_pipeline called with non-trivial operation: {:?} targeting {:?}", + operation.operation_type(), + operation.target(), + ); + let resource_type = operation.resource_type(); let target = operation.target(); @@ -36,8 +58,8 @@ pub(crate) fn plan_pipeline(operation: &CosmosOperation) -> azure_core::Result
<Pipeline> { OperationTarget::FeedRange(_) =>
{ return Err(azure_core::Error::with_message( azure_core::error::ErrorKind::Other, - "FeedRange targeting is not yet implemented; \ - fan-out pipeline planning requires partition resolution", + "FeedRange targeting requires a fan-out pipeline; \ + use plan_operation for cross-partition queries", )); } }; @@ -46,6 +68,105 @@ pub(crate) fn plan_pipeline(operation: &CosmosOperation) -> azure_core::Result
<Pipeline> {
+/// Builds a fan-out [`Pipeline`] for a cross-partition query from a backend
+/// query plan, resolving the plan's EPK ranges against the current topology.
+pub(crate) async fn build_sequential_drain(
+    query_plan: &QueryPlan,
+    topology_provider: &mut impl TopologyProvider,
+    operation: &CosmosOperation,
+) ->
azure_core::Result { + validate_query_plan(query_plan)?; + + // Convert query ranges to FeedRanges and resolve against topology. + let mut request_nodes: Vec> = Vec::new(); + for query_range in &query_plan.query_ranges { + let min = EffectivePartitionKey::from(query_range.min.as_str()); + let max = EffectivePartitionKey::from(query_range.max.as_str()); + let feed_range = FeedRange::new(min, max); + let resolved = topology_provider + .resolve_ranges(&feed_range, PartitionRoutingRefresh::UseCached) + .await?; + + for resolved_range in resolved { + let target = RequestTarget::EffectivePartitionKeyRange { + range: resolved_range.range, + partition_key_range_id: resolved_range.partition_key_range_id, + }; + request_nodes.push(Box::new(Request::new(operation.clone(), target))); + } + } + + // TODO: enforce max fan-out (default 100, configurable). See FEED_OPERATIONS_REQS.md §3. + + if request_nodes.is_empty() { + return Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "query plan produced no partition ranges to query", + )); + } + + let root: Box = if request_nodes.len() == 1 { + request_nodes.into_iter().next().unwrap() + } else { + Box::new(SequentialDrain::new(request_nodes)) + }; + + Ok(Pipeline::new(root)) +} + +/// Validates that the query plan does not require features we don't yet support. +fn validate_query_plan(plan: &QueryPlan) -> azure_core::Result<()> { + if plan.hybrid_search_query_info.is_some() { + return Err(unsupported_feature("hybrid search queries")); + } + + if let Some(info) = &plan.query_info { + validate_query_info(info)?; + } + + Ok(()) +} + +fn validate_query_info(info: &QueryInfo) -> azure_core::Result<()> { + if info.top.is_some() { + return Err(unsupported_feature("TOP clause in cross-partition queries")); + } + if info.limit.is_some() { + return Err(unsupported_feature( + "LIMIT clause in cross-partition queries", + )); + } + if !info.order_by.is_empty() { + return Err(unsupported_feature("ORDER BY in cross-partition queries")); + } + if !info.aggregates.is_empty() { + return Err(unsupported_feature("aggregates in cross-partition queries")); + } + if !info.group_by_expressions.is_empty() { + return Err(unsupported_feature("GROUP BY in cross-partition queries")); + } + Ok(()) +} + +fn unsupported_feature(feature: &str) -> azure_core::Error { + azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + format!("unsupported query feature: {feature}"), + ) +} + fn target_description(target: &OperationTarget) -> &'static str { match target { OperationTarget::None => "None", @@ -58,11 +179,16 @@ fn target_description(target: &OperationTarget) -> &'static str { mod tests { use std::borrow::Cow; + use futures::FutureExt as _; + use super::*; - use crate::models::{ - AccountReference, ContainerProperties, ContainerReference, DatabaseReference, - ItemReference, OperationType, PartitionKey, PartitionKeyDefinition, ResourceType, - SystemProperties, + use crate::{ + driver::dataflow::{mocks::*, query_plan::QueryRange, ResolvedRange}, + models::{ + effective_partition_key::EffectivePartitionKey, AccountReference, ContainerProperties, + ContainerReference, DatabaseReference, ItemReference, OperationType, PartitionKey, + PartitionKeyDefinition, ResourceType, SystemProperties, + }, }; fn test_account() -> AccountReference { @@ -99,12 +225,20 @@ mod tests { ) } - // --- plan_pipeline tests --- + fn cross_partition_query_operation() -> CosmosOperation { + CosmosOperation::query_items( + test_container(), + 
OperationTarget::FeedRange(FeedRange::full()), + ) + .with_body(br#"{"query":"SELECT * FROM c"}"#.to_vec()) + } + + // --- build_trivial_pipeline tests --- #[test] fn plans_non_partitioned_pipeline_for_database_read() { let op = CosmosOperation::read_database(test_database()); - let pipeline = plan_pipeline(&op).unwrap(); + let pipeline = build_trivial_pipeline(&op).unwrap(); let request = pipeline.root().downcast_ref::().unwrap(); assert_eq!(*request.target(), RequestTarget::NonPartitioned); @@ -117,7 +251,7 @@ mod tests { let pk = PartitionKey::from("pk-value"); let item = ItemReference::from_name(&test_container(), pk.clone(), "doc1"); let op = CosmosOperation::read_item(item); - let pipeline = plan_pipeline(&op).unwrap(); + let pipeline = build_trivial_pipeline(&op).unwrap(); let request = pipeline.root().downcast_ref::().unwrap(); assert_eq!( @@ -131,12 +265,379 @@ mod tests { #[test] fn rejects_feed_range_target() { let op = CosmosOperation::read_all_items_cross_partition(test_container()); - let result = plan_pipeline(&op); - let err = result.err().expect("expected error for FeedRange target"); - assert!( - err.to_string().contains("FeedRange"), - "expected FeedRange error, got: {err}" + // In debug builds, this panics via debug_assert; in release builds it returns Err. + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + build_trivial_pipeline(&op) + })); + + match result { + // Panicked in debug mode (expected) + Err(_) if cfg!(debug_assertions) => {} + // Panicked in release mode (bad) + Err(_) => panic!("did not expect panic for FeedRange target"), + // Returned Err in release mode (also acceptable) + Ok(Err(err)) => { + assert_eq!( + err.to_string(), + "FeedRange targeting requires a fan-out pipeline; \ + use plan_operation for cross-partition queries" + ); + } + _ => panic!("expected error or panic for FeedRange target"), + } + } + + // --- build_sequential_drain tests --- + + /// Shorthand to build a `QueryRange` from hex-prefix EPK strings. + fn qr(min: &str, max: &str) -> QueryRange { + QueryRange { + min: min.to_string(), + max: max.to_string(), + is_min_inclusive: true, + is_max_inclusive: false, + } + } + + /// Shorthand to build a `ResolvedRange` from (min, max, pk_range_id). + fn rr(min: &str, max: &str, pk_range_id: &str) -> ResolvedRange { + ResolvedRange { + partition_key_range_id: pk_range_id.to_string(), + range: FeedRange::new( + EffectivePartitionKey::from(min), + EffectivePartitionKey::from(max), + ), + } + } + + /// Builds a query plan with the given query ranges (and no query info). + fn plan_with_ranges(ranges: Vec) -> QueryPlan { + QueryPlan { + partitioned_query_execution_info_version: 1, + query_info: None, + query_ranges: ranges, + hybrid_search_query_info: None, + } + } + + /// Asserts that the pipeline is a single `Request` targeting the expected EPK range. + fn assert_single_request( + pipeline: &Pipeline, + expected_min: &str, + expected_max: &str, + expected_pk_range_id: &str, + ) { + let request = pipeline + .root() + .downcast_ref::() + .expect("expected single Request root"); + assert_eq!( + *request.target(), + RequestTarget::EffectivePartitionKeyRange { + range: FeedRange::new( + EffectivePartitionKey::from(expected_min), + EffectivePartitionKey::from(expected_max), + ), + partition_key_range_id: expected_pk_range_id.to_string(), + } + ); + } + + /// Asserts that the pipeline is a `SequentialDrain` containing `Request` nodes + /// targeting the given EPK ranges (in order). 
+ fn assert_drain_requests(pipeline: Pipeline, expected: &[(&str, &str, &str)]) { + let drain = pipeline + .into_root() + .downcast::() + .expect("expected SequentialDrain root"); + let children = drain.into_children(); + assert_eq!( + children.len(), + expected.len(), + "expected {} request nodes, got {}", + expected.len(), + children.len(), ); + for (child, &(min, max, pk_range_id)) in children.into_iter().zip(expected) { + let request = child + .downcast::() + .expect("expected Request child node"); + assert_eq!( + *request.target(), + RequestTarget::EffectivePartitionKeyRange { + range: FeedRange::new( + EffectivePartitionKey::from(min), + EffectivePartitionKey::from(max), + ), + partition_key_range_id: pk_range_id.to_string(), + }, + "mismatch for pk range {pk_range_id}" + ); + } + } + + #[tokio::test] + async fn builds_single_node_pipeline_for_one_partition() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-0")])]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + assert_single_request(&pipeline, "", "FF", "pkrange-0"); + } + + #[tokio::test] + async fn builds_sequential_drain_for_multiple_partitions() { + // Query targets full range, topology has two partitions split at "80". + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![ + rr("", "80", "pkrange-left"), + rr("80", "FF", "pkrange-right"), + ])]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + assert_drain_requests( + pipeline, + &[("", "80", "pkrange-left"), ("80", "FF", "pkrange-right")], + ); + } + + #[tokio::test] + async fn builds_pipeline_for_multiple_query_ranges() { + // Query plan specifies two disjoint query ranges; each resolves to one partition. + let plan = plan_with_ranges(vec![qr("", "40"), qr("80", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![ + Ok(vec![rr("", "40", "pkrange-A")]), + Ok(vec![rr("80", "FF", "pkrange-C")]), + ]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + assert_drain_requests( + pipeline, + &[("", "40", "pkrange-A"), ("80", "FF", "pkrange-C")], + ); + } + + #[tokio::test] + async fn query_range_spans_multiple_topology_partitions() { + // A single query range [00, C0) spans three topology partitions. + let plan = plan_with_ranges(vec![qr("00", "C0")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![ + rr("00", "40", "pkrange-1"), + rr("40", "80", "pkrange-2"), + rr("80", "C0", "pkrange-3"), + ])]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + assert_drain_requests( + pipeline, + &[ + ("00", "40", "pkrange-1"), + ("40", "80", "pkrange-2"), + ("80", "C0", "pkrange-3"), + ], + ); + } + + #[tokio::test] + async fn multiple_query_ranges_each_spanning_multiple_partitions() { + // Two query ranges, each resolving to multiple partitions. The resulting + // pipeline should have all resolved ranges in order. + let plan = plan_with_ranges(vec![qr("", "60"), qr("A0", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![ + // First query range [, 60) spans two partitions. 
+ Ok(vec![ + rr("", "30", "pkrange-alpha"), + rr("30", "60", "pkrange-beta"), + ]), + // Second query range [A0, FF) spans two partitions. + Ok(vec![ + rr("A0", "D0", "pkrange-gamma"), + rr("D0", "FF", "pkrange-delta"), + ]), + ]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + assert_drain_requests( + pipeline, + &[ + ("", "30", "pkrange-alpha"), + ("30", "60", "pkrange-beta"), + ("A0", "D0", "pkrange-gamma"), + ("D0", "FF", "pkrange-delta"), + ], + ); + } + + #[tokio::test] + async fn topology_partition_wider_than_query_range() { + // The topology partition [, FF) is wider than query range [20, 80). + // The resolved range matches the topology, not the query range. + let plan = plan_with_ranges(vec![qr("20", "80")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-wide")])]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + assert_single_request(&pipeline, "", "FF", "pkrange-wide"); + } + + #[tokio::test] + async fn rejects_query_plan_with_top() { + let plan = QueryPlan { + query_info: Some(QueryInfo { + top: Some(10), + ..Default::default() + }), + ..plan_with_ranges(vec![qr("", "FF")]) + }; + let op = cross_partition_query_operation(); + let mut topology = NoopTopologyProvider; + + let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + assert_eq!( + err.to_string(), + "unsupported query feature: TOP clause in cross-partition queries" + ); + } + + #[tokio::test] + async fn rejects_query_plan_with_limit() { + let plan = QueryPlan { + query_info: Some(QueryInfo { + limit: Some(20), + ..Default::default() + }), + ..plan_with_ranges(vec![qr("", "FF")]) + }; + let op = cross_partition_query_operation(); + let mut topology = NoopTopologyProvider; + + let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + assert_eq!( + err.to_string(), + "unsupported query feature: LIMIT clause in cross-partition queries" + ); + } + + #[tokio::test] + async fn rejects_query_plan_with_order_by() { + use super::super::query_plan::SortOrder; + let plan = QueryPlan { + query_info: Some(QueryInfo { + order_by: vec![SortOrder::Ascending], + ..Default::default() + }), + ..plan_with_ranges(vec![qr("", "FF")]) + }; + let op = cross_partition_query_operation(); + let mut topology = NoopTopologyProvider; + + let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + assert_eq!( + err.to_string(), + "unsupported query feature: ORDER BY in cross-partition queries" + ); + } + + #[tokio::test] + async fn rejects_query_plan_with_aggregates() { + let plan = QueryPlan { + query_info: Some(QueryInfo { + aggregates: vec!["Count".to_string()], + ..Default::default() + }), + ..plan_with_ranges(vec![qr("", "FF")]) + }; + let op = cross_partition_query_operation(); + let mut topology = NoopTopologyProvider; + + let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + assert_eq!( + err.to_string(), + "unsupported query feature: aggregates in cross-partition queries" + ); + } + + #[tokio::test] + async fn rejects_query_plan_with_group_by() { + let plan = QueryPlan { + query_info: Some(QueryInfo { + group_by_expressions: vec!["c.category".to_string()], + ..Default::default() + }), + ..plan_with_ranges(vec![qr("", "FF")]) + }; + let op = cross_partition_query_operation(); + let mut topology = NoopTopologyProvider; + + let err = build_sequential_drain(&plan, &mut topology, 
&op).await.unwrap_err(); + assert_eq!( + err.to_string(), + "unsupported query feature: GROUP BY in cross-partition queries" + ); + } + + #[tokio::test] + async fn rejects_query_plan_with_hybrid_search() { + let plan = QueryPlan { + hybrid_search_query_info: Some(super::super::query_plan::HybridSearchQueryInfo { + global_statistics_query: "SELECT COUNT(1) FROM c".to_string(), + component_query_infos: vec![], + component_weights: vec![], + skip: None, + take: Some(10), + requires_global_statistics: true, + }), + ..plan_with_ranges(vec![qr("", "FF")]) + }; + let op = cross_partition_query_operation(); + let mut topology = NoopTopologyProvider; + + let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + assert_eq!( + err.to_string(), + "unsupported query feature: hybrid search queries" + ); + } + + #[tokio::test] + async fn accepts_query_plan_with_no_query_info() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-0")])]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + assert_single_request(&pipeline, "", "FF", "pkrange-0"); + } + + #[tokio::test] + async fn rejects_empty_query_ranges() { + let plan = plan_with_ranges(vec![]); + let op = cross_partition_query_operation(); + let mut topology = NoopTopologyProvider; + + let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + assert_eq!( + err.to_string(), + "query plan produced no partition ranges to query" + ); + } + + #[tokio::test] + async fn propagates_topology_resolution_error() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "topology resolution failed", + ))]); + + let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + assert_eq!(err.to_string(), "topology resolution failed"); } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs new file mode 100644 index 00000000000..78f9966ba15 --- /dev/null +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs @@ -0,0 +1,263 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +//! Backend query plan models. +//! +//! These types model the response from the Cosmos DB Gateway when issuing a +//! query plan request (`OperationType::QueryPlan`). The planner uses them to +//! determine partition targeting, detect unsupported query features, and build +//! the dataflow pipeline. + +use std::collections::HashMap; + +use serde::Deserialize; + +/// The response returned by the Gateway for a query plan request. +#[derive(Debug, Default, Deserialize)] +#[serde(rename_all = "camelCase")] +pub(crate) struct QueryPlan { + /// The version of the query plan format. + pub partitioned_query_execution_info_version: usize, + + /// Detailed query information (ordering, aggregates, rewrites, etc.). + #[serde(default)] + pub query_info: Option, + + /// The EPK ranges that the query references. + /// + /// Used by the planner to limit which physical partitions get queried. + pub query_ranges: Vec, + + /// Information about hybrid search queries, if applicable. 
+ pub hybrid_search_query_info: Option, +} + +/// Information about a hybrid search query. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub(crate) struct HybridSearchQueryInfo { + /// The query used for global statistics gathering. + pub global_statistics_query: String, + + /// Individual component queries that make up the hybrid search. + pub component_query_infos: Vec, + + /// Weights assigned to each component query. + #[serde(default)] + pub component_weights: Vec, + + /// Number of results to skip. + pub skip: Option, + + /// Number of results to take (always present for hybrid search). + pub take: Option, + + /// Whether global statistics are required. + pub requires_global_statistics: bool, +} + +/// The kind of DISTINCT tracking required by the query. +#[derive(Debug, Deserialize, Default, PartialEq, Eq)] +pub(crate) enum DistinctType { + /// No deduplication required. + #[default] + None, + + /// Order-preserving deduplication. + Ordered, + + /// Order-independent deduplication. + Unordered, +} + +/// Detailed query plan information. +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +#[serde(rename_all = "camelCase")] +pub(crate) struct QueryInfo { + /// The kind of DISTINCT clause, if any. + pub distinct_type: DistinctType, + + /// `TOP` clause limit. + pub top: Option, + + /// `OFFSET` clause value. + pub offset: Option, + + /// `LIMIT` clause value (from `OFFSET`/`LIMIT`). + pub limit: Option, + + /// Sort orders for `ORDER BY` expressions. + pub order_by: Vec, + + /// Expressions used by `ORDER BY` clauses. + pub order_by_expressions: Vec, + + /// Expressions used by `GROUP BY` clauses. + pub group_by_expressions: Vec, + + /// Aliases used by `GROUP BY` clauses. + pub group_by_aliases: Vec, + + /// Aggregates used in the `SELECT` portion of a `GROUP BY` query. + pub aggregates: Vec, + + /// Mapping from GROUP BY aliases to aggregate types. + pub group_by_alias_to_aggregate_type: HashMap, + + /// Rewritten form of the query for single-partition sub-queries. + /// + /// When non-empty, this should be used instead of the original query text + /// for individual partition requests. + pub rewritten_query: String, + + /// Whether the query contains a `SELECT VALUE` clause. + pub has_select_value: bool, + + /// Whether the query contains a non-streaming `ORDER BY`. + pub has_non_streaming_order_by: bool, +} + +/// Sort order for an `ORDER BY` expression. +#[derive(Debug, Deserialize, Clone, Copy, PartialEq, Eq)] +pub(crate) enum SortOrder { + Ascending, + Descending, +} + +/// An EPK range covered by the query. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub(crate) struct QueryRange { + /// The minimum EPK value. + pub min: String, + + /// The maximum EPK value. + pub max: String, + + /// Whether the minimum value is inclusive. + pub is_min_inclusive: bool, + + /// Whether the maximum value is inclusive. 
+ pub is_max_inclusive: bool, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn deserializes_minimal_query_plan() { + let json = r#"{ + "partitionedQueryExecutionInfoVersion": 1, + "queryRanges": [ + { + "min": "", + "max": "FF", + "isMinInclusive": true, + "isMaxInclusive": false + } + ] + }"#; + let plan: QueryPlan = serde_json::from_str(json).unwrap(); + assert_eq!(plan.partitioned_query_execution_info_version, 1); + assert!(plan.query_info.is_none()); + assert!(plan.hybrid_search_query_info.is_none()); + assert_eq!(plan.query_ranges.len(), 1); + assert_eq!(plan.query_ranges[0].min, ""); + assert_eq!(plan.query_ranges[0].max, "FF"); + assert!(plan.query_ranges[0].is_min_inclusive); + assert!(!plan.query_ranges[0].is_max_inclusive); + } + + #[test] + fn deserializes_query_plan_with_order_by() { + let json = r#"{ + "partitionedQueryExecutionInfoVersion": 2, + "queryInfo": { + "orderBy": ["Ascending", "Descending"], + "orderByExpressions": ["c.name", "c.age"], + "rewrittenQuery": "SELECT c.name, c.age FROM c ORDER BY c.name ASC, c.age DESC" + }, + "queryRanges": [] + }"#; + let plan: QueryPlan = serde_json::from_str(json).unwrap(); + let info = plan.query_info.unwrap(); + assert_eq!( + info.order_by, + vec![SortOrder::Ascending, SortOrder::Descending] + ); + assert_eq!(info.order_by_expressions, vec!["c.name", "c.age"]); + } + + #[test] + fn deserializes_query_plan_with_top_and_aggregates() { + let json = r#"{ + "partitionedQueryExecutionInfoVersion": 1, + "queryInfo": { + "top": 10, + "aggregates": ["Count"], + "distinctType": "Ordered" + }, + "queryRanges": [] + }"#; + let plan: QueryPlan = serde_json::from_str(json).unwrap(); + let info = plan.query_info.unwrap(); + assert_eq!(info.top, Some(10)); + assert_eq!(info.aggregates, vec!["Count"]); + assert_eq!(info.distinct_type, DistinctType::Ordered); + } + + #[test] + fn deserializes_query_plan_with_hybrid_search() { + let json = r#"{ + "partitionedQueryExecutionInfoVersion": 1, + "queryRanges": [], + "hybridSearchQueryInfo": { + "globalStatisticsQuery": "SELECT COUNT(1) FROM c", + "componentQueryInfos": [], + "componentWeights": [0.5, 0.5], + "skip": null, + "take": 10, + "requiresGlobalStatistics": true + } + }"#; + let plan: QueryPlan = serde_json::from_str(json).unwrap(); + let hybrid = plan.hybrid_search_query_info.unwrap(); + assert_eq!(hybrid.global_statistics_query, "SELECT COUNT(1) FROM c"); + assert_eq!(hybrid.component_weights, vec![0.5, 0.5]); + assert_eq!(hybrid.take, Some(10)); + assert!(hybrid.requires_global_statistics); + } + + #[test] + fn deserializes_query_plan_with_offset_limit() { + let json = r#"{ + "partitionedQueryExecutionInfoVersion": 1, + "queryInfo": { + "offset": 5, + "limit": 20 + }, + "queryRanges": [] + }"#; + let plan: QueryPlan = serde_json::from_str(json).unwrap(); + let info = plan.query_info.unwrap(); + assert_eq!(info.offset, Some(5)); + assert_eq!(info.limit, Some(20)); + } + + #[test] + fn deserializes_multiple_query_ranges() { + let json = r#"{ + "partitionedQueryExecutionInfoVersion": 1, + "queryRanges": [ + { "min": "", "max": "40", "isMinInclusive": true, "isMaxInclusive": false }, + { "min": "80", "max": "FF", "isMinInclusive": true, "isMaxInclusive": false } + ] + }"#; + let plan: QueryPlan = serde_json::from_str(json).unwrap(); + assert_eq!(plan.query_ranges.len(), 2); + assert_eq!(plan.query_ranges[0].max, "40"); + assert_eq!(plan.query_ranges[1].min, "80"); + } +} diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs 
b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs index 6f27316ef02..a5433cce133 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs @@ -112,6 +112,10 @@ impl PipelineNode for Request { fn children(&self) -> ChildNodes<'_> { ChildNodes::None } + + fn into_children(self) -> Vec> { + Vec::new() + } } impl Request { async fn handle_partition_topology_change( @@ -159,7 +163,9 @@ impl Request { context: &mut PipelineContext<'_>, range: &FeedRange, ) -> azure_core::Result { - let resolved = context.resolve_ranges(range).await?; + let resolved = context + .resolve_ranges(range, PartitionRoutingRefresh::ForceRefresh) + .await?; let replacement_nodes: Vec> = resolved .into_iter() @@ -495,7 +501,7 @@ mod tests { let mut context = PipelineContext::new(&mut executor, &mut topology); let err = request.next_page(&mut context).await.unwrap_err(); - assert!(err.to_string().contains("topology fetch failed")); + assert_eq!(err.to_string(), "topology fetch failed"); } #[tokio::test] diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs index 1a47b40f8c1..c40cae64e4f 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/topology.rs @@ -10,15 +10,15 @@ use crate::{ models::{ContainerReference, FeedRange}, }; -use super::{ResolvedRange, TopologyProvider}; +use super::{PartitionRoutingRefresh, ResolvedRange, TopologyProvider}; /// Adapts [`PartitionKeyRangeCache`] to the [`TopologyProvider`] trait. /// /// Holds a reference to the cache, the container being queried, and a function /// that fetches partition key ranges from the service. On each -/// [`resolve_ranges`](TopologyProvider::resolve_ranges) call, it force-refreshes -/// the cache (since splits are the reason we're resolving) and converts the -/// resulting `PartitionKeyRange` objects to [`ResolvedRange`] values. +/// [`resolve_ranges`](TopologyProvider::resolve_ranges) call, it uses the +/// provided [`PartitionRoutingRefresh`](super::PartitionRoutingRefresh) to +/// decide whether to refresh the cache first. /// /// # Type parameters /// @@ -54,15 +54,16 @@ where fn resolve_ranges<'a>( &'a mut self, range: &'a FeedRange, + refresh: PartitionRoutingRefresh, ) -> BoxFuture<'a, azure_core::Result>> { + let force_refresh = matches!(refresh, PartitionRoutingRefresh::ForceRefresh); Box::pin(async move { - // Force-refresh because we're recovering from a topology change (split). 
let pk_ranges = self .cache .resolve_overlapping_ranges( &self.container, range.min_inclusive()..range.max_exclusive(), - true, + force_refresh, &self.fetch_pk_ranges, ) .await; @@ -186,7 +187,10 @@ mod tests { let mut provider = CachedTopologyProvider::new(&cache, make_container(), single_range_fetch); - let ranges = provider.resolve_ranges(&FeedRange::full()).await.unwrap(); + let ranges = provider + .resolve_ranges(&FeedRange::full(), PartitionRoutingRefresh::ForceRefresh) + .await + .unwrap(); assert_eq!(ranges.len(), 1); assert_eq!(ranges[0].partition_key_range_id, "0"); @@ -205,7 +209,10 @@ mod tests { let cache = PartitionKeyRangeCache::new(); let mut provider = CachedTopologyProvider::new(&cache, make_container(), two_range_fetch); - let ranges = provider.resolve_ranges(&FeedRange::full()).await.unwrap(); + let ranges = provider + .resolve_ranges(&FeedRange::full(), PartitionRoutingRefresh::ForceRefresh) + .await + .unwrap(); assert_eq!(ranges.len(), 2); assert_eq!(ranges[0].partition_key_range_id, "1"); @@ -237,7 +244,10 @@ mod tests { EffectivePartitionKey::min(), EffectivePartitionKey::from("80"), ); - let ranges = provider.resolve_ranges(&left_half).await.unwrap(); + let ranges = provider + .resolve_ranges(&left_half, PartitionRoutingRefresh::ForceRefresh) + .await + .unwrap(); assert_eq!(ranges.len(), 1); assert_eq!(ranges[0].partition_key_range_id, "1"); @@ -248,7 +258,10 @@ mod tests { let cache = PartitionKeyRangeCache::new(); let mut provider = CachedTopologyProvider::new(&cache, make_container(), three_range_fetch); - let ranges = provider.resolve_ranges(&FeedRange::full()).await.unwrap(); + let ranges = provider + .resolve_ranges(&FeedRange::full(), PartitionRoutingRefresh::ForceRefresh) + .await + .unwrap(); assert_eq!(ranges.len(), 3); assert_eq!(ranges[0].partition_key_range_id, "1"); @@ -262,11 +275,12 @@ mod tests { let mut provider = CachedTopologyProvider::new(&cache, make_container(), failing_fetch); let err = provider - .resolve_ranges(&FeedRange::full()) + .resolve_ranges(&FeedRange::full(), PartitionRoutingRefresh::ForceRefresh) .await .unwrap_err(); - assert!(err - .to_string() - .contains("failed to resolve partition key ranges")); + assert_eq!( + err.to_string(), + "failed to resolve partition key ranges from topology cache" + ); } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs index 22fab1801f0..bc899604699 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/mod.rs @@ -21,6 +21,7 @@ mod runtime; pub(crate) mod transport; pub use cosmos_driver::CosmosDriver; +pub use dataflow::OperationPlan; pub use runtime::{CosmosDriverRuntime, CosmosDriverRuntimeBuilder}; /// Walks an error's `.source()` chain and joins all distinct messages into a diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/lib.rs b/sdk/cosmos/azure_data_cosmos_driver/src/lib.rs index 978f929bdc3..36f4cbc7841 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/lib.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/lib.rs @@ -32,6 +32,6 @@ pub mod testing; // Re-export key types at crate root pub use diagnostics::{DiagnosticsContext, ExecutionContext, RequestDiagnostics, RequestHandle}; -pub use driver::{CosmosDriver, CosmosDriverRuntime, CosmosDriverRuntimeBuilder}; +pub use driver::{CosmosDriver, CosmosDriverRuntime, CosmosDriverRuntimeBuilder, OperationPlan}; pub use models::{ActivityId, CosmosResponse, CosmosStatus, RequestCharge}; pub use 
options::{DiagnosticsOptions, DiagnosticsVerbosity, DriverOptions}; diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs index 22c60131fc2..85ea6e668dc 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs @@ -33,6 +33,7 @@ pub(crate) mod request_header_names { pub const END_EPK: &str = "x-ms-end-epk"; pub const PARTITION_KEY: &str = "x-ms-documentdb-partitionkey"; pub const PARTITION_KEY_RANGE_ID: &str = "x-ms-documentdb-partitionkeyrangeid"; + pub const SUPPORTED_QUERY_FEATURES: &str = "x-ms-cosmos-supported-query-features"; } /// Standard Cosmos DB response header names. @@ -85,6 +86,12 @@ pub struct CosmosRequestHeaders { /// /// The driver serializes this to JSON for the header value. pub offer_autopilot_settings: Option, + + /// Supported query features (`x-ms-cosmos-supported-query-features`). + /// + /// Sent on query plan requests to indicate which query capabilities the + /// client supports. The backend uses this to shape its response. + pub supported_query_features: Option, } impl CosmosRequestHeaders { @@ -133,6 +140,12 @@ impl CosmosRequestHeaders { ); } } + if let Some(features) = self.supported_query_features.as_ref() { + headers.insert( + request_header_names::SUPPORTED_QUERY_FEATURES, + HeaderValue::from(features.clone()), + ); + } } } @@ -529,6 +542,7 @@ mod tests { precondition: None, offer_throughput: None, offer_autopilot_settings: None, + supported_query_features: None, }; assert_eq!( @@ -549,6 +563,7 @@ mod tests { precondition: None, offer_throughput: None, offer_autopilot_settings: None, + supported_query_features: None, }; let mut headers = Headers::new(); @@ -572,6 +587,7 @@ mod tests { precondition: Some(Precondition::if_match(ETag::new("etag-value-1"))), offer_throughput: None, offer_autopilot_settings: None, + supported_query_features: None, }; let mut headers = Headers::new(); @@ -595,6 +611,7 @@ mod tests { precondition: Some(Precondition::if_none_match(ETag::new("*"))), offer_throughput: None, offer_autopilot_settings: None, + supported_query_features: None, }; let mut headers = Headers::new(); @@ -618,6 +635,7 @@ mod tests { precondition: None, offer_throughput: None, offer_autopilot_settings: None, + supported_query_features: None, }; let mut headers = Headers::new(); @@ -641,6 +659,7 @@ mod tests { precondition: Some(Precondition::if_match(ETag::new("etag-abc"))), offer_throughput: None, offer_autopilot_settings: None, + supported_query_features: None, }; let mut headers = Headers::new(); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs index 153f7bd6a7a..9b41714c56e 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs @@ -175,6 +175,10 @@ impl CosmosOperation { ) -> Self { let resource_reference = resource_reference.into(); let resource_type = resource_reference.resource_type(); + debug_assert!( + !resource_type.is_partitioned() || target.has_partition_reference(), + "Attempted to create a partitioned operation without an OperationTarget specifying the partitions to access" + ); Self { operation_type, resource_type, @@ -575,37 +579,49 @@ impl CosmosOperation { ) } - /// Queries items within a single partition. + /// Queries items in a container. 
+ /// + /// The `target` determines partition scope: use + /// [`OperationTarget::PartitionKey`] for single-partition queries, or + /// [`OperationTarget::FeedRange`] for cross-partition queries. /// /// Use `with_body()` to provide the query JSON. - /// This is more efficient than cross-partition queries. - pub fn query_items(container: ContainerReference, partition_key: PartitionKey) -> Self { + pub fn query_items(container: ContainerReference, target: OperationTarget) -> Self { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); - Self::new( - OperationType::Query, - resource_ref, - OperationTarget::PartitionKey(partition_key), - ) + Self::new(OperationType::Query, resource_ref, target) } - /// Queries items across all partitions. + /// Creates a query plan request for a container. /// - /// Use `with_body()` to provide the query JSON. + /// The query plan request is sent to the backend gateway to obtain + /// execution metadata (partition targeting, rewritten query, etc.) + /// before issuing the actual cross-partition query. /// - /// **Warning:** Cross-partition queries are inherently less efficient than - /// single-partition queries. Use `query_items()` with a partition key - /// when possible. - pub fn query_items_cross_partition(container: ContainerReference) -> Self { + /// Use `with_body()` to provide the query JSON (same as the original query). + pub(crate) fn query_plan(container: ContainerReference) -> Self { let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) .with_resource_type(ResourceType::Document) .into_feed_reference(); + let mut headers = CosmosRequestHeaders::new(); + headers.supported_query_features = Some(String::new()); Self::new( - OperationType::Query, + OperationType::QueryPlan, resource_ref, - OperationTarget::FeedRange(crate::models::FeedRange::full()), + OperationTarget::None, ) + .with_request_headers(headers) + } + + /// Creates a read-feed request for partition key ranges in a container. + /// + /// Used to populate the partition key range cache for topology resolution. + pub(crate) fn read_partition_key_ranges(container: ContainerReference) -> Self { + let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container) + .with_resource_type(ResourceType::PartitionKeyRange) + .into_feed_reference(); + Self::new(OperationType::ReadFeed, resource_ref, OperationTarget::None) } /// Returns true if this is a read-only operation. @@ -618,6 +634,21 @@ impl CosmosOperation { self.operation_type.is_idempotent() } + /// Returns true if this operation can be planned with a single-node pipeline. + /// + /// An operation is "trivial" when it does not require fan-out across multiple + /// physical partitions. This includes all non-query operations and queries + /// that target a specific logical partition key (single-partition queries) + /// OR queries against a non-partitioned resource (Databases, Containers, Offers, etc.). + /// + /// Cross-partition queries (those targeting a [`FeedRange`](crate::models::FeedRange)) + /// are **not** trivial and require a backend query plan to determine the + /// fan-out strategy. + pub fn is_trivial(&self) -> bool { + self.operation_type != OperationType::Query + || !matches!(self.target(), OperationTarget::FeedRange(_)) + } + // -- Offer operations -- /// Queries offers in the account. 
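To make the reworked `query_items` surface concrete, here is a short sketch (illustrative only; assumes a `container: ContainerReference` is in scope) showing how single-partition and cross-partition query operations are built, and how `is_trivial` classifies them:

```rust
// Single-partition query: targets one logical partition key, so it is
// trivial and can be planned as a single-node pipeline.
let single = CosmosOperation::query_items(
    container.clone(),
    OperationTarget::PartitionKey(PartitionKey::from("pk1")),
)
.with_body(br#"{"query":"SELECT * FROM c"}"#.to_vec());
assert!(single.is_trivial());

// Cross-partition query: targets the full feed range, so it is not trivial
// and requires a backend query plan to determine the fan-out strategy.
let cross = CosmosOperation::query_items(
    container,
    OperationTarget::FeedRange(FeedRange::full()),
)
.with_body(br#"{"query":"SELECT * FROM c"}"#.to_vec());
assert!(!cross.is_trivial());
```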
@@ -698,10 +729,14 @@ mod tests { #[test] fn create_operation() { - let item_ref = - ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); + let pk = PartitionKey::from("pk1"); + let item_ref = ItemReference::from_name(&test_container(), pk.clone(), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Create, resource_ref, OperationTarget::None); + let op = CosmosOperation::new( + OperationType::Create, + resource_ref, + OperationTarget::PartitionKey(pk), + ); assert_eq!(op.operation_type(), OperationType::Create); assert_eq!(op.resource_type(), ResourceType::Document); @@ -711,10 +746,14 @@ mod tests { #[test] fn read_operation() { - let item_ref = - ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); + let pk = PartitionKey::from("pk1"); + let item_ref = ItemReference::from_name(&test_container(), pk.clone(), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Read, resource_ref, OperationTarget::None); + let op = CosmosOperation::new( + OperationType::Read, + resource_ref, + OperationTarget::PartitionKey(pk), + ); assert_eq!(op.operation_type(), OperationType::Read); assert_eq!(op.resource_type(), ResourceType::Document); @@ -738,22 +777,30 @@ mod tests { #[test] fn operation_with_body() { - let item_ref = - ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); + let pk = PartitionKey::from("pk1"); + let item_ref = ItemReference::from_name(&test_container(), pk.clone(), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); let body = b"{\"id\":\"doc1\"}".to_vec(); - let op = CosmosOperation::new(OperationType::Create, resource_ref, OperationTarget::None) - .with_body(body.clone()); + let op = CosmosOperation::new( + OperationType::Create, + resource_ref, + OperationTarget::PartitionKey(pk), + ) + .with_body(body.clone()); assert_eq!(op.body(), Some(body.as_slice())); } #[test] fn replace_is_idempotent() { - let item_ref = - ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); + let pk = PartitionKey::from("pk1"); + let item_ref = ItemReference::from_name(&test_container(), pk.clone(), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Replace, resource_ref, OperationTarget::None); + let op = CosmosOperation::new( + OperationType::Replace, + resource_ref, + OperationTarget::PartitionKey(pk), + ); assert!(!op.is_read_only()); assert!(op.is_idempotent()); @@ -761,12 +808,27 @@ mod tests { #[test] fn upsert_is_not_idempotent() { - let item_ref = - ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); + let pk = PartitionKey::from("pk1"); + let item_ref = ItemReference::from_name(&test_container(), pk.clone(), "doc1"); let resource_ref: CosmosResourceReference = item_ref.into(); - let op = CosmosOperation::new(OperationType::Upsert, resource_ref, OperationTarget::None); + let op = CosmosOperation::new( + OperationType::Upsert, + resource_ref, + OperationTarget::PartitionKey(pk), + ); assert!(!op.is_read_only()); assert!(!op.is_idempotent()); } + + /// Creating a partitioned operation without a partition target panics in + /// debug builds and silently proceeds in release builds. 
+ #[test] + #[cfg_attr(debug_assertions, should_panic)] + fn rejects_partitioned_operation_without_target() { + let item_ref = + ItemReference::from_name(&test_container(), PartitionKey::from("pk1"), "doc1"); + let resource_ref: CosmosResourceReference = item_ref.into(); + let _op = CosmosOperation::new(OperationType::Create, resource_ref, OperationTarget::None); + } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs index b51af12e8f9..83f480df6a8 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs @@ -349,6 +349,14 @@ impl ResourceType { } } + /// Returns true if this resource type is partitioned (requires a partition key to access it). + pub fn is_partitioned(self) -> bool { + matches!( + self, + ResourceType::Document // Attachment/Conflict not yet supported + ) + } + /// Returns true if this resource type is metadata (not data plane items). pub fn is_metadata(self) -> bool { matches!( diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs index 0670a88ec46..63fbbe7dea4 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/operation_target.rs @@ -34,3 +34,10 @@ pub enum OperationTarget { /// container key space ([`FeedRange::full()`]). FeedRange(FeedRange), } + +impl OperationTarget { + /// Returns `true` if the target has a partition reference (i.e., it is not [`None`](Self::None)). + pub fn has_partition_reference(&self) -> bool { + !matches!(self, OperationTarget::None) + } +} From f84bf28ab76330221b92e2d463e36216f167787a Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Thu, 7 May 2026 20:25:32 -0700 Subject: [PATCH 19/29] Separate planning and execution of operations --- .../src/clients/container_client.rs | 99 ++++++++------- .../src/clients/cosmos_client.rs | 11 +- .../src/clients/database_client.rs | 33 ++--- .../src/clients/offers_client.rs | 15 ++- .../azure_data_cosmos/src/driver_bridge.rs | 27 +++- .../azure_data_cosmos/src/query/executor.rs | 11 +- .../benches/point_read.rs | 2 + .../azure_data_cosmos_benchmarks/src/lib.rs | 5 +- .../src/driver/cosmos_driver.rs | 119 ++++++++++++++---- .../src/models/cosmos_operation.rs | 6 +- .../emulator_tests/driver_backup_endpoints.rs | 3 +- .../tests/framework/test_client.rs | 22 ++-- .../tests/multi_region_failover.rs | 2 + 13 files changed, 243 insertions(+), 112 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs index 68f05183993..14e6bde2ad7 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs @@ -101,11 +101,12 @@ impl ContainerClient { ) -> azure_core::Result> { let operation = CosmosOperation::read_container(self.container_ref.clone()); - let driver_response = self - .context - .driver - .execute_operation(operation, OperationOptions::default()) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + OperationOptions::default(), + ) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -155,11 +156,12 @@ impl ContainerClient { operation_options.content_response_on_write = 
Some(azure_data_cosmos_driver::options::ContentResponseOnWrite::Enabled); - let driver_response = self - .context - .driver - .execute_operation(operation, operation_options) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + operation_options, + ) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -247,11 +249,12 @@ impl ContainerClient { ) -> azure_core::Result> { let operation = CosmosOperation::delete_container(self.container_ref.clone()); - let driver_response = self - .context - .driver - .execute_operation(operation, OperationOptions::default()) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + OperationOptions::default(), + ) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -345,11 +348,12 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = self - .context - .driver - .execute_operation(operation, options.operation) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + options.operation, + ) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -443,11 +447,12 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = self - .context - .driver - .execute_operation(operation, options.operation) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + options.operation, + ) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -545,11 +550,12 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = self - .context - .driver - .execute_operation(operation, options.operation) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + options.operation, + ) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -605,11 +611,12 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = self - .context - .driver - .execute_operation(operation, options.operation) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + options.operation, + ) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -657,11 +664,12 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = self - .context - .driver - .execute_operation(operation, options.operation) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + options.operation, + ) + .await?; // Bridge the driver response to the SDK response type. 
Ok(ItemResponse::new( @@ -811,11 +819,12 @@ impl ContainerClient { CosmosOperation::batch(self.container_ref.clone(), driver_pk).with_body(body); let operation = apply_batch_options(operation, &options); - let driver_response = self - .context - .driver - .execute_operation(operation, options.operation) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + options.operation, + ) + .await?; Ok(BatchResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs index 324d1f5a622..de33ec10735 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs @@ -170,11 +170,12 @@ impl CosmosClient { operation_options.content_response_on_write = Some(azure_data_cosmos_driver::options::ContentResponseOnWrite::Enabled); - let driver_response = self - .context - .driver - .execute_operation(operation, operation_options) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + operation_options, + ) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs index bdb7fccf150..93428ad25d1 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs @@ -84,11 +84,12 @@ impl DatabaseClient { ) -> azure_core::Result> { let operation = CosmosOperation::read_database(self.database_ref.clone()); - let driver_response = self - .context - .driver - .execute_operation(operation, OperationOptions::default()) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + OperationOptions::default(), + ) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -169,11 +170,12 @@ impl DatabaseClient { operation_options.content_response_on_write = Some(azure_data_cosmos_driver::options::ContentResponseOnWrite::Enabled); - let driver_response = self - .context - .driver - .execute_operation(operation, operation_options) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + operation_options, + ) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -193,11 +195,12 @@ impl DatabaseClient { ) -> azure_core::Result> { let operation = CosmosOperation::delete_database(self.database_ref.clone()); - let driver_response = self - .context - .driver - .execute_operation(operation, OperationOptions::default()) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &self.context.driver, + operation, + OperationOptions::default(), + ) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs index 5e88ab4a173..6551a6d3813 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs @@ -38,7 
+38,8 @@ pub(crate) async fn find_offer( headers.insert(CONTENT_TYPE, HeaderValue::from("application/query+json")); let options = OperationOptions::default().with_custom_headers(headers); - let driver_response = driver.execute_operation(operation, options).await?; + let driver_response = + crate::driver_bridge::execute_point_operation(&driver, operation, options).await?; tracing::debug!( activity_id = ?driver_response.headers().activity_id, request_charge = ?driver_response.headers().request_charge, @@ -55,9 +56,12 @@ pub(crate) async fn read_offer_by_id( offer_id: &str, ) -> azure_core::Result> { let operation = CosmosOperation::read_offer(account.clone(), offer_id.to_owned()); - let driver_response = driver - .execute_operation(operation, OperationOptions::default()) - .await?; + let driver_response = crate::driver_bridge::execute_point_operation( + &driver, + operation, + OperationOptions::default(), + ) + .await?; Ok(crate::driver_bridge::driver_response_to_cosmos_response( driver_response, )) @@ -105,7 +109,8 @@ pub(crate) async fn begin_replace( opts }; - let driver_response = driver.execute_operation(operation, replace_options).await?; + let driver_response = + crate::driver_bridge::execute_point_operation(&driver, operation, replace_options).await?; let response = crate::driver_bridge::driver_response_to_cosmos_response(driver_response); diff --git a/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs b/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs index 2c842808a19..5f5bdf776d9 100644 --- a/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs +++ b/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs @@ -11,7 +11,32 @@ use azure_core::{ http::{headers::Headers, response::Response, RawResponse, StatusCode}, Bytes, }; -use azure_data_cosmos_driver::models::{CosmosResponse as DriverResponse, CosmosResponseHeaders}; +use azure_data_cosmos_driver::{ + models::{CosmosOperation, CosmosResponse as DriverResponse, CosmosResponseHeaders}, + options::OperationOptions as DriverOperationOptions, + CosmosDriver, +}; + +/// Executes a point operation through the driver, returning the response. +/// +/// Convenience wrapper that plans and executes in one call, asserting that the +/// pipeline produces exactly one response. Used for all single-response +/// operations (reads, writes, metadata calls) in the SDK layer. +pub(crate) async fn execute_point_operation( + driver: &CosmosDriver, + operation: CosmosOperation, + options: DriverOperationOptions, +) -> azure_core::Result { + driver + .execute_operation(operation, options, None) + .await? + .ok_or_else(|| { + azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "point operation completed without producing a response", + ) + }) +} use crate::{ constants::{ diff --git a/sdk/cosmos/azure_data_cosmos/src/query/executor.rs b/sdk/cosmos/azure_data_cosmos/src/query/executor.rs index bf3d8b0fac7..11d79fa4c24 100644 --- a/sdk/cosmos/azure_data_cosmos/src/query/executor.rs +++ b/sdk/cosmos/azure_data_cosmos/src/query/executor.rs @@ -118,7 +118,16 @@ impl QueryExecutor { let op_options = self.base_options.clone().with_custom_headers(headers); // Execute through the driver - let driver_response = self.driver.execute_operation(operation, op_options).await?; + let driver_response = self + .driver + .execute_operation(operation, op_options, None) + .await? 
+ .ok_or_else(|| { + azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "query operation completed without producing a response", + ) + })?; // Bridge driver response to SDK types let cosmos_response = diff --git a/sdk/cosmos/azure_data_cosmos_benchmarks/benches/point_read.rs b/sdk/cosmos/azure_data_cosmos_benchmarks/benches/point_read.rs index 3dd9f07c61c..bdecc20fc11 100644 --- a/sdk/cosmos/azure_data_cosmos_benchmarks/benches/point_read.rs +++ b/sdk/cosmos/azure_data_cosmos_benchmarks/benches/point_read.rs @@ -50,6 +50,7 @@ fn bench_point_read(c: &mut Criterion) { .execute_operation( CosmosOperation::read_item(item_ref.clone()), OperationOptions::default(), + None, ) .await .expect("execute_operation failed") @@ -67,6 +68,7 @@ fn bench_point_read(c: &mut Criterion) { .execute_operation( CosmosOperation::read_item(item_ref.clone()), OperationOptions::default(), + None, ) .await .expect("execute_operation failed") diff --git a/sdk/cosmos/azure_data_cosmos_benchmarks/src/lib.rs b/sdk/cosmos/azure_data_cosmos_benchmarks/src/lib.rs index a52b728396b..5f99fef3d38 100644 --- a/sdk/cosmos/azure_data_cosmos_benchmarks/src/lib.rs +++ b/sdk/cosmos/azure_data_cosmos_benchmarks/src/lib.rs @@ -272,6 +272,7 @@ pub async fn setup_live() -> (Arc, ItemReference) { .execute_operation( CosmosOperation::create_database(account.clone()).with_body(db_body.into_bytes()), OperationOptions::default(), + None, ) .await, ) @@ -290,6 +291,7 @@ pub async fn setup_live() -> (Arc, ItemReference) { CosmosOperation::create_container(database_ref) .with_body(container_body.into_bytes()), OperationOptions::default(), + None, ) .await, ) @@ -313,6 +315,7 @@ pub async fn setup_live() -> (Arc, ItemReference) { .execute_operation( CosmosOperation::create_item(item_ref).with_body(item_body.into_bytes()), OperationOptions::default(), + None, ) .await, ) @@ -332,7 +335,7 @@ pub async fn setup_live() -> (Arc, ItemReference) { /// Used during setup to ignore "resource already exists" responses when /// creating the benchmark database, container, and item. fn ignore_conflict( - result: azure_core::Result, + result: azure_core::Result>, ) -> azure_core::Result<()> { match result { Ok(_) => Ok(()), diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs index d3bbc1d18c6..6be96400263 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs @@ -682,7 +682,7 @@ impl CosmosDriver { let options = OperationOptions::default(); let db_result = self - .execute_operation( + .execute_point_operation( CosmosOperation::read_database(db_ref.clone()), options.clone(), ) @@ -697,7 +697,7 @@ impl CosmosDriver { })?; let container_result = self - .execute_operation( + .execute_point_operation( CosmosOperation::read_container_by_name(db_ref, container_name.to_owned()), options, ) @@ -734,7 +734,7 @@ impl CosmosDriver { let options = OperationOptions::default(); let db_result = self - .execute_operation( + .execute_point_operation( CosmosOperation::read_database(db_ref.clone()), options.clone(), ) @@ -748,7 +748,7 @@ impl CosmosDriver { .unwrap_or_else(|| db_rid.to_owned()); let container_result = self - .execute_operation( + .execute_point_operation( CosmosOperation::read_container_by_rid(db_ref, container_rid.to_owned()), options, ) @@ -986,23 +986,30 @@ impl CosmosDriver { /// Executes a Cosmos DB operation. 
     ///
-    /// This method computes effective options by merging the provided operation options
-    /// with driver and runtime defaults, then executes the operation.
+    /// This method advances the operation by one page. If a `plan` is provided,
+    /// the operation uses that plan's pipeline to execute the next page. If
+    /// `plan` is `None`, the operation is planned first (via
+    /// [`plan_operation`](Self::plan_operation)) and then the first page is
+    /// executed.
     ///
     /// # Parameters
     ///
-    /// - `operation`: The operation to execute
-    /// - `options`: Operation-specific options that override driver and runtime defaults
+    /// - `operation`: The operation to execute.
+    /// - `options`: Operation-specific options that override driver and runtime defaults.
+    /// - `plan`: An optional mutable reference to a pre-built [`OperationPlan`].
+    ///   Pass `Some` to advance a multi-page feed pipeline. Pass `None` to plan
+    ///   and execute in a single call (the common path for point operations).
     ///
     /// # Returns
     ///
-    /// Returns a [`crate::models::CosmosResponse`] on success.
+    /// Returns `Ok(Some(response))` when a page of results is produced, or
+    /// `Ok(None)` when the pipeline is fully drained (no more pages).
     ///
     /// # Errors
     ///
     /// Returns an error if:
-    /// - The account has no authentication configured
-    /// - The resource reference cannot produce a valid path
+    /// - The driver has not been initialized
+    /// - Planning fails (e.g. invalid operation target, backend query plan error)
    /// - The HTTP request fails
     ///
     /// # Example
     ///
@@ -1023,12 +1030,12 @@ impl CosmosDriver {
     ///
     /// let driver = runtime.get_or_create_driver(account, None).await?;
     ///
-    /// // Execute operations with operation-specific options that override defaults
+    /// // Point operation: plan and execute in one call.
     /// let options = OperationOptionsBuilder::new()
     ///     .with_content_response_on_write(ContentResponseOnWrite::Disabled)
     ///     .build();
     ///
-    /// // let result = driver.execute_operation(operation, options).await?;
+    /// // let result = driver.execute_operation(operation, options, None).await?;
     /// # Ok(())
     /// # }
     /// ```
@@ -1036,7 +1043,8 @@
         &self,
         operation: CosmosOperation,
         options: OperationOptions,
-    ) -> azure_core::Result<CosmosResponse> {
+        plan: Option<&mut OperationPlan>,
+    ) -> azure_core::Result<Option<CosmosResponse>> {
         if !self.initialized.load(Ordering::Acquire) {
             let endpoint = AccountEndpoint::from(self.options.account());
             return Err(azure_core::Error::with_message(
@@ -1049,24 +1057,83 @@ impl CosmosDriver {
         }
 
         tracing::debug!("operation started");
-        let mut pipeline = planner::build_trivial_pipeline(&operation)?;
-
         let mut executor = DriverRequestExecutor {
             driver: self,
             options: &options,
         };
-        let mut topology = StubTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
 
-        match pipeline.next_page(&mut context).await? {
-            Some(response) => Ok(response),
-            None => Err(azure_core::Error::with_message(
-                azure_core::error::ErrorKind::Other,
-                "request dataflow pipeline completed without emitting a response",
-            )),
+        match plan {
+            Some(plan) => {
+                // Caller provided a pre-built plan: use its pipeline with a real
+                // topology provider for split recovery.
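+                // Only container-scoped operations can observe partition
+                // splits, so the cache-backed topology provider is wired up
+                // only when a container reference is available.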
+ let container = operation.container(); + match container { + Some(container_ref) => { + let mut topology = CachedTopologyProvider::new( + &self.pk_range_cache, + container_ref.clone(), + |container, continuation| { + self.fetch_partition_key_ranges(container, continuation) + }, + ); + let mut context = PipelineContext::new(&mut executor, &mut topology); + plan.pipeline.next_page(&mut context).await + } + None => { + // Non-container operations (metadata, etc.) don't need topology. + let mut topology = StubTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + plan.pipeline.next_page(&mut context).await + } + } + } + None => { + // No plan provided: plan the operation first, then execute one page. + let container = operation.container().cloned(); + let mut owned_plan = self.plan_operation(operation, &options).await?; + + match container { + Some(container_ref) => { + let mut topology = CachedTopologyProvider::new( + &self.pk_range_cache, + container_ref, + |container, continuation| { + self.fetch_partition_key_ranges(container, continuation) + }, + ); + let mut context = PipelineContext::new(&mut executor, &mut topology); + owned_plan.pipeline.next_page(&mut context).await + } + None => { + let mut topology = StubTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + owned_plan.pipeline.next_page(&mut context).await + } + } + } } } + /// Convenience helper for internal point operations. + /// + /// Plans and executes in one call, asserting that a response is produced. + /// Used by internal metadata-fetching helpers that always expect a single + /// response page. + async fn execute_point_operation( + &self, + operation: CosmosOperation, + options: OperationOptions, + ) -> azure_core::Result { + self.execute_operation(operation, options, None) + .await? + .ok_or_else(|| { + azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "point operation completed without producing a response", + ) + }) + } + async fn execute_operation_direct( &self, operation: &CosmosOperation, @@ -1206,7 +1273,7 @@ impl CosmosDriver { /// // Use the resolved container for item operations /// let item = ItemReference::from_name(&container, PartitionKey::from("pk1"), "doc1"); /// let result = driver - /// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default()) + /// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default(), None) /// .await?; /// # Ok(()) /// # } @@ -2111,7 +2178,7 @@ mod tests { fn _assert_execute_operation_future_is_send() { fn assert_send(_: T) {} let driver: &CosmosDriver = todo!(); - assert_send(driver.execute_operation(todo!(), todo!())); + assert_send(driver.execute_operation(todo!(), todo!(), todo!())); } // Account properties with two readable locations for regional fallback tests. diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs index 9b41714c56e..af383f3af55 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs @@ -49,7 +49,7 @@ use std::borrow::Cow; /// // 3. 
Build and execute item operations /// let item = ItemReference::from_name(&container, PartitionKey::from("pk1"), "doc1"); /// let result = driver -/// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default()) +/// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default(), None) /// .await?; /// # Ok(()) /// # } @@ -347,6 +347,7 @@ impl CosmosOperation { /// .execute_operation( /// CosmosOperation::delete_container(container), /// OperationOptions::default(), + /// None, /// ) /// .await?; /// # Ok(()) @@ -433,6 +434,7 @@ impl CosmosOperation { /// CosmosOperation::create_item(item) /// .with_body(br#"{"id": "doc1", "pk": "pk-value", "data": "hello"}"#.to_vec()), /// OperationOptions::default(), + /// None, /// ) /// .await?; /// # Ok(()) @@ -474,7 +476,7 @@ impl CosmosOperation { /// /// let item = ItemReference::from_name(&container, PartitionKey::from("pk-value"), "doc1"); /// let result = driver - /// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default()) + /// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default(), None) /// .await?; /// # Ok(()) /// # } diff --git a/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs b/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs index 6706a24fa11..765eb23866c 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs @@ -81,7 +81,7 @@ async fn driver_operations_work_after_backup_boot() -> Result<(), Box let operation = CosmosOperation::create_database(account.clone()).with_body(body.into_bytes()); let result = driver - .execute_operation(operation, OperationOptions::default()) + .execute_operation(operation, OperationOptions::default(), None) .await; assert!( @@ -96,6 +96,7 @@ async fn driver_operations_work_after_backup_boot() -> Result<(), Box .execute_operation( CosmosOperation::delete_database(db_ref), OperationOptions::default(), + None, ) .await; diff --git a/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs b/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs index 6657157d9e2..e195a2af02f 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs @@ -255,11 +255,11 @@ impl DriverTestRunContext { .with_body(body.into_bytes()); let result = driver - .execute_operation(operation, OperationOptions::default()) + .execute_operation(operation, OperationOptions::default(), None) .await?; // Check for success status (201 Created) - let status = result.diagnostics().status(); + let status = result.as_ref().and_then(|r| r.diagnostics().status()); if !status.map(|s| s.is_success()).unwrap_or(false) { return Err(format!("Failed to create database, status: {:?}", status).into()); } @@ -284,11 +284,11 @@ impl DriverTestRunContext { let operation = CosmosOperation::delete_database(database.clone()); let result = driver - .execute_operation(operation, OperationOptions::default()) + .execute_operation(operation, OperationOptions::default(), None) .await?; // Check for success status (204 No Content) - let status = result.diagnostics().status(); + let status = result.as_ref().and_then(|r| r.diagnostics().status()); if !status.map(|s| s.is_success()).unwrap_or(false) { return Err(format!("Failed to delete database, status: {:?}", 
status).into()); } @@ -317,11 +317,11 @@ impl DriverTestRunContext { CosmosOperation::create_container(database.clone()).with_body(body.into_bytes()); let result = driver - .execute_operation(operation, OperationOptions::default()) + .execute_operation(operation, OperationOptions::default(), None) .await?; // Check for success status (201 Created) - let status = result.diagnostics().status(); + let status = result.as_ref().and_then(|r| r.diagnostics().status()); if !status.map(|s| s.is_success()).unwrap_or(false) { return Err(format!("Failed to create container, status: {:?}", status).into()); } @@ -353,8 +353,9 @@ impl DriverTestRunContext { let operation = CosmosOperation::create_item(item_ref).with_body(body.to_vec()); let result = driver - .execute_operation(operation, OperationOptions::default()) - .await?; + .execute_operation(operation, OperationOptions::default(), None) + .await? + .ok_or("create_item produced no response")?; Ok(result) } @@ -377,8 +378,9 @@ impl DriverTestRunContext { let operation = CosmosOperation::read_item(item_ref); let result = driver - .execute_operation(operation, OperationOptions::default()) - .await?; + .execute_operation(operation, OperationOptions::default(), None) + .await? + .ok_or("read_item produced no response")?; Ok(result) } diff --git a/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs b/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs index fe6d2cad804..6700da87c1c 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs @@ -57,6 +57,7 @@ async fn write_forbidden_triggers_refresh_and_failover() { .execute_operation( CosmosOperation::read_database(db_ref), OperationOptions::default(), + None, ) .await; } @@ -91,6 +92,7 @@ async fn session_not_available_retries_across_locations() { .execute_operation( CosmosOperation::read_database(db_ref), OperationOptions::default(), + None, ) .await; } From 25f7eeda47b53c134349fb41b33a46187445333e Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Fri, 8 May 2026 07:53:57 -0700 Subject: [PATCH 20/29] Integrate 2-stage planning for non-point operations --- .../src/clients/container_client.rs | 35 ++-- sdk/cosmos/azure_data_cosmos/src/feed.rs | 16 +- .../azure_data_cosmos/src/options/mod.rs | 13 ++ .../src/driver/cosmos_driver.rs | 165 +++++++++--------- .../src/models/cosmos_operation.rs | 10 +- .../emulator_tests/driver_backup_endpoints.rs | 5 +- .../tests/framework/test_client.rs | 39 ++--- .../tests/multi_region_failover.rs | 6 +- 8 files changed, 141 insertions(+), 148 deletions(-) diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs index 14e6bde2ad7..fe9044609c5 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs @@ -13,8 +13,8 @@ use crate::{ }, resource_context::ResourceLink, transactional_batch::TransactionalBatch, - DeleteContainerOptions, FeedItemIterator, ItemReadOptions, ItemWriteOptions, PartitionKey, - Query, ReplaceContainerOptions, ThroughputOptions, + DeleteContainerOptions, FeedItemIterator, FeedPageIterator, ItemReadOptions, ItemWriteOptions, + PartitionKey, Query, ReplaceContainerOptions, ThroughputOptions, }; use std::sync::Arc; @@ -736,7 +736,7 @@ impl ContainerClient { /// ``` /// /// See [`PartitionKey`](crate::PartitionKey) for more information on how to specify a partition key, 
and [`Query`] for more information on how to specify a query. - pub fn query_items( + pub async fn query_items( &self, query: impl Into, partition_key: impl Into, @@ -748,21 +748,22 @@ impl ContainerClient { let driver_pk = partition_key.into_driver_partition_key(); let container_ref = self.container_ref.clone(); - let factory = move || { - CosmosOperation::query_items( - container_ref.clone(), - OperationTarget::PartitionKey(driver_pk.clone()), - ) - }; - crate::query::executor::QueryExecutor::new( - self.context.driver.clone(), - factory, - query, - options.operation, - options.session_token, - ) - .into_stream() + // The first operation to execute in the query items flow. + // This holds the session token provided by the user, if any. + let mut initial_operation = CosmosOperation::query_items( + container_ref.clone(), + OperationTarget::PartitionKey(driver_pk.clone()), + ); + if let Some(token) = options.session_token { + initial_operation = initial_operation.with_session_token(token); + } + let plan = self + .context + .driver + .plan_operation(&initial_operation, &options.operation) + .await?; + Ok(FeedPageIterator::new(options.operation, plan)) } /// Executes a transactional batch of operations. diff --git a/sdk/cosmos/azure_data_cosmos/src/feed.rs b/sdk/cosmos/azure_data_cosmos/src/feed.rs index b1706e31c9d..892f9331640 100644 --- a/sdk/cosmos/azure_data_cosmos/src/feed.rs +++ b/sdk/cosmos/azure_data_cosmos/src/feed.rs @@ -1,13 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -use std::{pin::Pin, task}; +use std::{pin::Pin, sync::Arc, task}; use azure_core::http::{ headers::Headers, pager::{PagerContinuation, PagerResult}, }; -use azure_data_cosmos_driver::models::CosmosResponseHeaders; +use azure_data_cosmos_driver::{ + models::CosmosResponseHeaders, options::OperationOptions, CosmosDriver, OperationPlan, +}; use futures::stream::BoxStream; use futures::Stream; use serde::{de::DeserializeOwned, Deserialize}; @@ -273,9 +275,7 @@ pub struct FeedItemIterator { impl FeedItemIterator { /// Creates a new `FeedItemIterator` from a stream of pages. - pub(crate) fn new( - stream: impl Stream>> + Send + 'static, - ) -> Self { + pub(crate) fn new(plan: OperationPlan) -> Self { Self { pages: Box::pin(stream), current: None, @@ -322,6 +322,12 @@ impl Stream for FeedItemIterator { pub struct FeedPageIterator(BoxStream<'static, azure_core::Result>>); +impl FeedPageIterator { + pub fn new(driver: Arc, options: OperationOptions, plan: OperationPlan) -> Self { + driver.execute_operation() + } +} + impl Stream for FeedPageIterator { type Item = azure_core::Result>; diff --git a/sdk/cosmos/azure_data_cosmos/src/options/mod.rs b/sdk/cosmos/azure_data_cosmos/src/options/mod.rs index c19ec76e12c..206ea5f8d35 100644 --- a/sdk/cosmos/azure_data_cosmos/src/options/mod.rs +++ b/sdk/cosmos/azure_data_cosmos/src/options/mod.rs @@ -266,6 +266,13 @@ pub struct QueryOptions { /// Session token for session-consistent queries. pub session_token: Option, + + /// Maximum number of items to return per page. + /// + /// When set, the server will return at most this many items in each response page. + /// This is useful for controlling memory usage and for testing pagination behavior. + /// If not set, the server uses its default page size. + pub max_item_count: Option, } impl QueryOptions { @@ -280,6 +287,12 @@ impl QueryOptions { self.operation = operation; self } + + /// Sets the maximum number of items to return per page. 
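+    ///
+    /// A sketch (this assumes `QueryOptions` implements `Default`, like the
+    /// other options types in this crate):
+    ///
+    /// ```ignore
+    /// let options = QueryOptions::default().with_max_item_count(100);
+    /// ```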
+    pub fn with_max_item_count(mut self, max_item_count: u32) -> Self {
+        self.max_item_count = Some(max_item_count);
+        self
+    }
 }
 
 /// Options to be passed to [`ContainerClient::read()`](crate::clients::ContainerClient::read()).
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 6be96400263..29c0ae9ae5c 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -986,19 +986,21 @@ impl CosmosDriver {
     /// Executes a Cosmos DB operation.
     ///
-    /// This method advances the operation by one page. If a `plan` is provided,
-    /// the operation uses that plan's pipeline to execute the next page. If
-    /// `plan` is `None`, the operation is planned first (via
-    /// [`plan_operation`](Self::plan_operation)) and then the first page is
-    /// executed.
+    /// This method plans the operation and then immediately executes one page.
+    /// That is sufficient for operations with trivial plans, such as point
+    /// operations and single-partition queries. When planning is expensive and
+    /// multiple pages will be requested, the caller should instead use
+    /// [`plan_operation`](Self::plan_operation) to build an [`OperationPlan`]
+    /// and then call [`execute_plan`](Self::execute_plan) for each page.
+    /// Retaining the [`OperationPlan`] lets the caller resume execution from a
+    /// previous page with all state intact, avoiding unnecessary replanning and
+    /// continuation token management.
     ///
     /// # Parameters
     ///
     /// - `operation`: The operation to execute.
     /// - `options`: Operation-specific options that override driver and runtime defaults.
-    /// - `plan`: An optional mutable reference to a pre-built [`OperationPlan`].
-    ///   Pass `Some` to advance a multi-page feed pipeline. Pass `None` to plan
-    ///   and execute in a single call (the common path for point operations).
     ///
     /// # Returns
     ///
@@ -1043,7 +1045,47 @@ impl CosmosDriver {
         &self,
         operation: CosmosOperation,
         options: OperationOptions,
-        plan: Option<&mut OperationPlan>,
+    ) -> azure_core::Result<Option<CosmosResponse>> {
+        let mut plan = self.plan_operation(&operation, &options).await?;
+        self.execute_plan(&mut plan, operation.container().cloned(), options)
+            .await
+    }
+
+    /// Executes a point operation (read/write item, read database, etc.) without a pre-planned pipeline.
+    ///
+    /// This is a convenience method around [`execute_operation`](Self::execute_operation) that
+    /// panics in debug builds if the operation completes without producing a response, and
+    /// returns an internal error in release builds.
+    pub async fn execute_point_operation(
+        &self,
+        operation: CosmosOperation,
+        options: OperationOptions,
+    ) -> azure_core::Result<CosmosResponse> {
+        match self.execute_operation(operation, options).await {
+            Ok(Some(r)) => Ok(r),
+            Ok(None) => {
+                if cfg!(debug_assertions) {
+                    panic!("point operation returned an empty page")
+                }
+                Err(azure_core::Error::with_message(
+                    azure_core::error::ErrorKind::Other,
+                    "internal error: point operation returned an empty page",
+                ))
+            }
+            Err(e) => Err(e),
+        }
+    }
+
+    /// Executes a single page of a pre-planned operation using the given plan and options.
+    ///
+    /// This function mutates the plan in place to account for any changes that occur during
+    /// execution (e.g. topology repairs or advancing page state).
+    /// After this returns, the plan may be executed again to fetch the next page of results, if any.
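+    /// Callers typically drive it in a loop, e.g.
+    /// `while let Some(page) = driver.execute_plan(&mut plan, container.clone(), options.clone()).await? { /* … */ }`.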
+    /// Once this returns `None`, there are no more pages to fetch, and the operation is complete.
+    pub async fn execute_plan(
+        &self,
+        plan: &mut OperationPlan,
+        container: Option<ContainerReference>,
+        options: OperationOptions,
     ) -> azure_core::Result<Option<CosmosResponse>> {
         if !self.initialized.load(Ordering::Acquire) {
             let endpoint = AccountEndpoint::from(self.options.account());
@@ -1062,76 +1104,18 @@ impl CosmosDriver {
             options: &options,
         };
 
-        match plan {
-            Some(plan) => {
-                // Caller provided a pre-built plan: use its pipeline with a real
-                // topology provider for split recovery.
-                let container = operation.container();
-                match container {
-                    Some(container_ref) => {
-                        let mut topology = CachedTopologyProvider::new(
-                            &self.pk_range_cache,
-                            container_ref.clone(),
-                            |container, continuation| {
-                                self.fetch_partition_key_ranges(container, continuation)
-                            },
-                        );
-                        let mut context = PipelineContext::new(&mut executor, &mut topology);
-                        plan.pipeline.next_page(&mut context).await
-                    }
-                    None => {
-                        // Non-container operations (metadata, etc.) don't need topology.
-                        let mut topology = StubTopologyProvider;
-                        let mut context = PipelineContext::new(&mut executor, &mut topology);
-                        plan.pipeline.next_page(&mut context).await
-                    }
-                }
-            }
-            None => {
-                // No plan provided: plan the operation first, then execute one page.
-                let container = operation.container().cloned();
-                let mut owned_plan = self.plan_operation(operation, &options).await?;
-
-                match container {
-                    Some(container_ref) => {
-                        let mut topology = CachedTopologyProvider::new(
-                            &self.pk_range_cache,
-                            container_ref,
-                            |container, continuation| {
-                                self.fetch_partition_key_ranges(container, continuation)
-                            },
-                        );
-                        let mut context = PipelineContext::new(&mut executor, &mut topology);
-                        owned_plan.pipeline.next_page(&mut context).await
-                    }
-                    None => {
-                        let mut topology = StubTopologyProvider;
-                        let mut context = PipelineContext::new(&mut executor, &mut topology);
-                        owned_plan.pipeline.next_page(&mut context).await
-                    }
-                }
-            }
-        }
-    }
+        let mut topology = match container {
+            Some(c) => Box::new(CachedTopologyProvider::new(
+                &self.pk_range_cache,
+                c,
+                |container, continuation| self.fetch_partition_key_ranges(container, continuation),
+            )) as Box<dyn TopologyProvider>,
+            None => Box::new(StubTopologyProvider) as Box<dyn TopologyProvider>,
+        };
 
-    /// Convenience helper for internal point operations.
-    ///
-    /// Plans and executes in one call, asserting that a response is produced.
-    /// Used by internal metadata-fetching helpers that always expect a single
-    /// response page.
-    async fn execute_point_operation(
-        &self,
-        operation: CosmosOperation,
-        options: OperationOptions,
-    ) -> azure_core::Result<CosmosResponse> {
-        self.execute_operation(operation, options, None)
-            .await?
-            .ok_or_else(|| {
-                azure_core::Error::with_message(
-                    azure_core::error::ErrorKind::Other,
-                    "point operation completed without producing a response",
-                )
-            })
+        let mut context = PipelineContext::new(&mut executor, topology.as_mut());
+
+        plan.pipeline.next_page(&mut context).await
     }
 
     async fn execute_operation_direct(
         &self,
         operation: &CosmosOperation,
@@ -1273,7 +1257,7 @@ impl CosmosDriver {
     /// // Use the resolved container for item operations
     /// let item = ItemReference::from_name(&container, PartitionKey::from("pk1"), "doc1");
     /// let result = driver
-    ///     .execute_operation(CosmosOperation::read_item(item), OperationOptions::default(), None)
+    ///     .execute_point_operation(CosmosOperation::read_item(item), OperationOptions::default())
     ///     .await?;
     /// # Ok(())
     /// # }
@@ -1344,9 +1328,20 @@ impl CosmosDriver {
     /// query plan from the backend and builds a fan-out pipeline.
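     ///
     /// # Example
     ///
     /// A sketch of the two-stage flow (this assumes an initialized `driver`,
     /// a `container` reference, and a feed `operation` built elsewhere):
     ///
     /// ```ignore
     /// let mut plan = driver.plan_operation(&operation, &options).await?;
     /// while let Some(page) = driver
     ///     .execute_plan(&mut plan, Some(container.clone()), options.clone())
     ///     .await?
     /// {
     ///     // one page of raw results per iteration
     /// }
     /// ```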
     pub async fn plan_operation(
         &self,
-        operation: CosmosOperation,
+        operation: &CosmosOperation,
         options: &OperationOptions,
     ) -> azure_core::Result<OperationPlan> {
+        if !self.initialized.load(Ordering::Acquire) {
+            let endpoint = AccountEndpoint::from(self.options.account());
+            return Err(azure_core::Error::with_message(
+                azure_core::error::ErrorKind::Other,
+                format!(
+                    "CosmosDriver for {endpoint} has not been initialized; call initialize() or \
+                     use CosmosDriverRuntime::get_or_create_driver() which initializes automatically"
+                ),
+            ));
+        }
+
+        // Trivial plan: anything that isn't a cross-partition query.
         if operation.is_trivial() {
             let pipeline = planner::build_trivial_pipeline(&operation)?;
@@ -2170,15 +2165,17 @@ mod tests {
         );
     }
 
-    /// Compile-time assertion that the `execute_operation` future is `Send`.
+    /// Compile-time assertion that the driver's operation futures are `Send`.
     ///
     /// This function is never called; it only needs to compile.
-    /// If the future returned by `execute_operation` is not `Send`, compilation will fail.
     #[allow(dead_code, unreachable_code, unused_variables)]
-    fn _assert_execute_operation_future_is_send() {
+    fn _assert_functions_are_send() {
         fn assert_send<T: Send>(_: T) {}
         let driver: &CosmosDriver = todo!();
-        assert_send(driver.execute_operation(todo!(), todo!(), todo!()));
+        assert_send(driver.execute_operation(todo!(), todo!()));
+        assert_send(driver.execute_point_operation(todo!(), todo!()));
+        assert_send(driver.execute_plan(todo!(), todo!(), todo!()));
+        assert_send(driver.plan_operation(todo!(), todo!()));
     }
 
     // Account properties with two readable locations for regional fallback tests.
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
index af383f3af55..5c16f073042 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
@@ -49,7 +49,7 @@ use std::borrow::Cow;
 /// // 3.
Build and execute item operations /// let item = ItemReference::from_name(&container, PartitionKey::from("pk1"), "doc1"); /// let result = driver -/// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default(), None) +/// .execute_point_operation(CosmosOperation::read_item(item), OperationOptions::default()) /// .await?; /// # Ok(()) /// # } @@ -344,10 +344,9 @@ impl CosmosOperation { /// let container = driver.resolve_container("my-database", "my-container").await?; /// /// let result = driver - /// .execute_operation( + /// .execute_point_operation( /// CosmosOperation::delete_container(container), /// OperationOptions::default(), - /// None, /// ) /// .await?; /// # Ok(()) @@ -430,11 +429,10 @@ impl CosmosOperation { /// /// let item = ItemReference::from_name(&container, PartitionKey::from("pk-value"), "doc1"); /// let result = driver - /// .execute_operation( + /// .execute_point_operation( /// CosmosOperation::create_item(item) /// .with_body(br#"{"id": "doc1", "pk": "pk-value", "data": "hello"}"#.to_vec()), /// OperationOptions::default(), - /// None, /// ) /// .await?; /// # Ok(()) @@ -476,7 +474,7 @@ impl CosmosOperation { /// /// let item = ItemReference::from_name(&container, PartitionKey::from("pk-value"), "doc1"); /// let result = driver - /// .execute_operation(CosmosOperation::read_item(item), OperationOptions::default(), None) + /// .execute_point_operation(CosmosOperation::read_item(item), OperationOptions::default()) /// .await?; /// # Ok(()) /// # } diff --git a/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs b/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs index 765eb23866c..f086dff6775 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/tests/emulator_tests/driver_backup_endpoints.rs @@ -81,7 +81,7 @@ async fn driver_operations_work_after_backup_boot() -> Result<(), Box let operation = CosmosOperation::create_database(account.clone()).with_body(body.into_bytes()); let result = driver - .execute_operation(operation, OperationOptions::default(), None) + .execute_point_operation(operation, OperationOptions::default()) .await; assert!( @@ -93,10 +93,9 @@ async fn driver_operations_work_after_backup_boot() -> Result<(), Box // Cleanup let db_ref = DatabaseReference::from_name(account, db_name); let _ = driver - .execute_operation( + .execute_point_operation( CosmosOperation::delete_database(db_ref), OperationOptions::default(), - None, ) .await; diff --git a/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs b/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs index e195a2af02f..4ae6019642d 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/tests/framework/test_client.rs @@ -254,16 +254,10 @@ impl DriverTestRunContext { let operation = CosmosOperation::create_database(self.client.account.clone()) .with_body(body.into_bytes()); - let result = driver - .execute_operation(operation, OperationOptions::default(), None) + driver + .execute_point_operation(operation, OperationOptions::default()) .await?; - // Check for success status (201 Created) - let status = result.as_ref().and_then(|r| r.diagnostics().status()); - if !status.map(|s| s.is_success()).unwrap_or(false) { - return Err(format!("Failed to create database, status: {:?}", status).into()); - } - Ok(DatabaseReference::from_name( 
self.client.account.clone(), db_name.to_string(), @@ -283,16 +277,10 @@ impl DriverTestRunContext { let operation = CosmosOperation::delete_database(database.clone()); - let result = driver - .execute_operation(operation, OperationOptions::default(), None) + driver + .execute_point_operation(operation, OperationOptions::default()) .await?; - // Check for success status (204 No Content) - let status = result.as_ref().and_then(|r| r.diagnostics().status()); - if !status.map(|s| s.is_success()).unwrap_or(false) { - return Err(format!("Failed to delete database, status: {:?}", status).into()); - } - Ok(()) } @@ -316,15 +304,10 @@ impl DriverTestRunContext { let operation = CosmosOperation::create_container(database.clone()).with_body(body.into_bytes()); - let result = driver - .execute_operation(operation, OperationOptions::default(), None) + driver + .execute_point_operation(operation, OperationOptions::default()) .await?; - // Check for success status (201 Created) - let status = result.as_ref().and_then(|r| r.diagnostics().status()); - if !status.map(|s| s.is_success()).unwrap_or(false) { - return Err(format!("Failed to create container, status: {:?}", status).into()); - } let db_name = database .name() .ok_or_else(|| "database reference must be name-based".to_string())?; @@ -353,9 +336,8 @@ impl DriverTestRunContext { let operation = CosmosOperation::create_item(item_ref).with_body(body.to_vec()); let result = driver - .execute_operation(operation, OperationOptions::default(), None) - .await? - .ok_or("create_item produced no response")?; + .execute_point_operation(operation, OperationOptions::default()) + .await?; Ok(result) } @@ -378,9 +360,8 @@ impl DriverTestRunContext { let operation = CosmosOperation::read_item(item_ref); let result = driver - .execute_operation(operation, OperationOptions::default(), None) - .await? 
- .ok_or("read_item produced no response")?; + .execute_point_operation(operation, OperationOptions::default()) + .await?; Ok(result) } diff --git a/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs b/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs index 6700da87c1c..22ee8067bd6 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/tests/multi_region_failover.rs @@ -54,10 +54,9 @@ async fn write_forbidden_triggers_refresh_and_failover() { ); let _ = driver - .execute_operation( + .execute_point_operation( CosmosOperation::read_database(db_ref), OperationOptions::default(), - None, ) .await; } @@ -89,10 +88,9 @@ async fn session_not_available_retries_across_locations() { ); let _ = driver - .execute_operation( + .execute_point_operation( CosmosOperation::read_database(db_ref), OperationOptions::default(), - None, ) .await; } From b01ad3c146ef094c8a310e390b82365d10e14b52 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Fri, 8 May 2026 18:34:41 +0000 Subject: [PATCH 21/29] Wire everything up for queries --- .../src/clients/container_client.rs | 109 ++++--- .../src/clients/cosmos_client.rs | 44 +-- .../src/clients/database_client.rs | 68 ++--- .../src/clients/offers_client.rs | 17 +- .../azure_data_cosmos/src/driver_bridge.rs | 27 +- sdk/cosmos/azure_data_cosmos/src/feed.rs | 105 +++++-- .../src/models/cosmos_response.rs | 5 +- .../src/{query/mod.rs => query.rs} | 4 - .../azure_data_cosmos/src/query/executor.rs | 145 --------- .../tests/emulator_tests/cosmos_containers.rs | 24 +- .../tests/emulator_tests/cosmos_databases.rs | 4 +- .../tests/emulator_tests/cosmos_query.rs | 276 +++++++++--------- .../tests/framework/test_client.rs | 7 +- .../src/driver/cosmos_driver.rs | 16 +- .../src/driver/dataflow/planner.rs | 6 +- .../src/driver/dataflow/request.rs | 170 +++++++---- .../src/driver/pipeline/operation_pipeline.rs | 83 ++++-- .../driver/transport/transport_pipeline.rs | 5 +- .../src/models/cosmos_headers.rs | 6 +- 19 files changed, 561 insertions(+), 560 deletions(-) rename sdk/cosmos/azure_data_cosmos/src/{query/mod.rs => query.rs} (99%) delete mode 100644 sdk/cosmos/azure_data_cosmos/src/query/executor.rs diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs index fe9044609c5..6a4c2ac70b4 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs @@ -101,12 +101,11 @@ impl ContainerClient { ) -> azure_core::Result> { let operation = CosmosOperation::read_container(self.container_ref.clone()); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - OperationOptions::default(), - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, OperationOptions::default()) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -156,12 +155,11 @@ impl ContainerClient { operation_options.content_response_on_write = Some(azure_data_cosmos_driver::options::ContentResponseOnWrite::Enabled); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - operation_options, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, operation_options) + .await?; Ok(ResourceResponse::new( 
crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -249,12 +247,11 @@ impl ContainerClient { ) -> azure_core::Result> { let operation = CosmosOperation::delete_container(self.container_ref.clone()); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - OperationOptions::default(), - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, OperationOptions::default()) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -348,12 +345,11 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - options.operation, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, options.operation) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -447,12 +443,11 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - options.operation, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, options.operation) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -550,12 +545,11 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - options.operation, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, options.operation) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -611,12 +605,11 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - options.operation, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, options.operation) + .await?; // Bridge the driver response to the SDK response type. Ok(ItemResponse::new( @@ -664,12 +657,11 @@ impl ContainerClient { let operation = apply_item_options(operation, options.session_token, options.precondition); // Execute through the driver. - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - options.operation, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, options.operation) + .await?; // Bridge the driver response to the SDK response type. 
Ok(ItemResponse::new( @@ -754,7 +746,8 @@ impl ContainerClient { let mut initial_operation = CosmosOperation::query_items( container_ref.clone(), OperationTarget::PartitionKey(driver_pk.clone()), - ); + ) + .with_body(serde_json::to_vec(&query)?); if let Some(token) = options.session_token { initial_operation = initial_operation.with_session_token(token); } @@ -763,7 +756,12 @@ impl ContainerClient { .driver .plan_operation(&initial_operation, &options.operation) .await?; - Ok(FeedPageIterator::new(options.operation, plan)) + Ok(FeedItemIterator::new( + self.context.driver.clone(), + Some(self.container_ref.clone()), + plan, + options.operation, + )) } /// Executes a transactional batch of operations. @@ -820,12 +818,11 @@ impl ContainerClient { CosmosOperation::batch(self.container_ref.clone(), driver_pk).with_body(body); let operation = apply_batch_options(operation, &options); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - options.operation, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, options.operation) + .await?; Ok(BatchResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs index de33ec10735..fb79f2d066f 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs @@ -117,29 +117,37 @@ impl CosmosClient { /// # async fn doc() -> Result<(), Box> { /// # use azure_data_cosmos::CosmosClient; /// # let client: CosmosClient = panic!("this is a non-running example"); - /// let dbs = client.query_databases( - /// "SELECT * FROM dbs", - /// None)?; + /// let dbs = client + /// .query_databases("SELECT * FROM dbs", None) + /// .await?; /// # } /// ``` /// /// See [`Query`] for more information on how to specify a query. - pub fn query_databases( + pub async fn query_databases( &self, query: impl Into, - _options: Option, + #[allow(unused_variables, reason = "This parameter may be used in the future")] + options: Option, ) -> azure_core::Result> { + let query = query.into(); let account = self.context.driver.account().clone(); - let factory = move || CosmosOperation::query_databases(account.clone()); + let initial_operation = + CosmosOperation::query_databases(account).with_body(serde_json::to_vec(&query)?); + let operation_options = OperationOptions::default(); + + let plan = self + .context + .driver + .plan_operation(&initial_operation, &operation_options) + .await?; - crate::query::executor::QueryExecutor::new( + Ok(FeedItemIterator::new( self.context.driver.clone(), - factory, - query.into(), - Default::default(), None, - ) - .into_stream() + plan, + operation_options, + )) } /// Creates a new database. 
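+            // Arguments: driver, container, plan, per-page options. Passing
+            // `Some(container)` lets `execute_plan` repair partition splits
+            // using the container's topology.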
@@ -170,12 +178,11 @@ impl CosmosClient { operation_options.content_response_on_write = Some(azure_data_cosmos_driver::options::ContentResponseOnWrite::Enabled); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - operation_options, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, operation_options) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -195,6 +202,7 @@ mod tests { fn _assert_futures_are_send() { fn assert_send(_: T) {} let client: &CosmosClient = todo!(); + assert_send(client.query_databases(Query::from("SELECT * FROM dbs"), todo!())); assert_send(client.create_database(todo!(), todo!())); } } diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs index 93428ad25d1..ae9d9d78c78 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs @@ -84,12 +84,11 @@ impl DatabaseClient { ) -> azure_core::Result> { let operation = CosmosOperation::read_database(self.database_ref.clone()); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - OperationOptions::default(), - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, OperationOptions::default()) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -112,33 +111,37 @@ impl DatabaseClient { /// # async fn doc() -> Result<(), Box> { /// # use azure_data_cosmos::clients::DatabaseClient; /// # let db_client: DatabaseClient = panic!("this is a non-running example"); - /// let containers = db_client.query_containers( - /// "SELECT * FROM dbs", - /// None)?; + /// let containers = db_client + /// .query_containers("SELECT * FROM dbs", None) + /// .await?; /// # } /// ``` /// /// See [`Query`] for more information on how to specify a query. #[allow(unused_variables, reason = "This parameter may be used in the future")] - pub fn query_containers( + pub async fn query_containers( &self, query: impl Into, + #[allow(unused_variables, reason = "This parameter may be used in the future")] options: Option, ) -> azure_core::Result> { - let db_ref = DatabaseReference::from_name( - self.context.driver.account().clone(), - self.database_id.clone(), - ); - let factory = move || CosmosOperation::query_containers(db_ref.clone()); + let query = query.into(); + let initial_operation = CosmosOperation::query_containers(self.database_ref.clone()) + .with_body(serde_json::to_vec(&query)?); + let operation_options = OperationOptions::default(); - crate::query::executor::QueryExecutor::new( + let plan = self + .context + .driver + .plan_operation(&initial_operation, &operation_options) + .await?; + + Ok(FeedItemIterator::new( self.context.driver.clone(), - factory, - query.into(), - Default::default(), None, - ) - .into_stream() + plan, + operation_options, + )) } /// Creates a new container. 
@@ -170,12 +173,11 @@ impl DatabaseClient { operation_options.content_response_on_write = Some(azure_data_cosmos_driver::options::ContentResponseOnWrite::Enabled); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - operation_options, - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, operation_options) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -195,12 +197,11 @@ impl DatabaseClient { ) -> azure_core::Result> { let operation = CosmosOperation::delete_database(self.database_ref.clone()); - let driver_response = crate::driver_bridge::execute_point_operation( - &self.context.driver, - operation, - OperationOptions::default(), - ) - .await?; + let driver_response = self + .context + .driver + .execute_point_operation(operation, OperationOptions::default()) + .await?; Ok(ResourceResponse::new( crate::driver_bridge::driver_response_to_cosmos_response(driver_response), @@ -297,6 +298,7 @@ mod tests { let client: &DatabaseClient = todo!(); assert_send(client.container_client(todo!())); assert_send(client.read(todo!())); + assert_send(client.query_containers(Query::from("SELECT * FROM c"), todo!())); assert_send(client.create_container(todo!(), todo!())); assert_send(client.delete(todo!())); assert_send(client.read_throughput(todo!())); diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs index 6551a6d3813..de3442f03e5 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/offers_client.rs @@ -38,8 +38,7 @@ pub(crate) async fn find_offer( headers.insert(CONTENT_TYPE, HeaderValue::from("application/query+json")); let options = OperationOptions::default().with_custom_headers(headers); - let driver_response = - crate::driver_bridge::execute_point_operation(&driver, operation, options).await?; + let driver_response = driver.execute_point_operation(operation, options).await?; tracing::debug!( activity_id = ?driver_response.headers().activity_id, request_charge = ?driver_response.headers().request_charge, @@ -56,12 +55,9 @@ pub(crate) async fn read_offer_by_id( offer_id: &str, ) -> azure_core::Result> { let operation = CosmosOperation::read_offer(account.clone(), offer_id.to_owned()); - let driver_response = crate::driver_bridge::execute_point_operation( - &driver, - operation, - OperationOptions::default(), - ) - .await?; + let driver_response = driver + .execute_point_operation(operation, OperationOptions::default()) + .await?; Ok(crate::driver_bridge::driver_response_to_cosmos_response( driver_response, )) @@ -109,8 +105,9 @@ pub(crate) async fn begin_replace( opts }; - let driver_response = - crate::driver_bridge::execute_point_operation(&driver, operation, replace_options).await?; + let driver_response = driver + .execute_point_operation(operation, replace_options) + .await?; let response = crate::driver_bridge::driver_response_to_cosmos_response(driver_response); diff --git a/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs b/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs index 5f5bdf776d9..36fe17d9725 100644 --- a/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs +++ b/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs @@ -17,27 +17,6 @@ use azure_data_cosmos_driver::{ CosmosDriver, }; -/// Executes a point operation through the driver, returning the response. 
-///
-/// Convenience wrapper that plans and executes in one call, asserting that the
-/// pipeline produces exactly one response. Used for all single-response
-/// operations (reads, writes, metadata calls) in the SDK layer.
-pub(crate) async fn execute_point_operation(
-    driver: &CosmosDriver,
-    operation: CosmosOperation,
-    options: DriverOperationOptions,
-) -> azure_core::Result<DriverResponse> {
-    driver
-        .execute_operation(operation, options, None)
-        .await?
-        .ok_or_else(|| {
-            azure_core::Error::with_message(
-                azure_core::error::ErrorKind::Other,
-                "point operation completed without producing a response",
-            )
-        })
-}
-
 use crate::{
     constants::{
@@ -355,11 +334,7 @@ mod tests {
         );
 
         let rt = tokio::runtime::Runtime::new().unwrap();
-        let page = rt
-            .block_on(QueryFeedPage::::from_response(
-                cosmos_response,
-            ))
-            .unwrap();
+        let page = QueryFeedPage::::from_response(cosmos_response).unwrap();
         assert_eq!(
             page.index_metrics(),
             Some(r#"{"UtilizedSingleIndexes":[]}"#)
diff --git a/sdk/cosmos/azure_data_cosmos/src/feed.rs b/sdk/cosmos/azure_data_cosmos/src/feed.rs
index 892f9331640..ef504e624cf 100644
--- a/sdk/cosmos/azure_data_cosmos/src/feed.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/feed.rs
@@ -1,21 +1,23 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 use std::{pin::Pin, sync::Arc, task};
 
 use azure_core::http::{
     headers::Headers,
     pager::{PagerContinuation, PagerResult},
 };
-use azure_data_cosmos_driver::{
-    models::CosmosResponseHeaders, options::OperationOptions, CosmosDriver, OperationPlan,
-};
+use azure_data_cosmos_driver::{
+    models::{ContainerReference, CosmosResponseHeaders},
+    options::OperationOptions,
+    CosmosDriver, OperationPlan,
+};
 use futures::stream::BoxStream;
 use futures::Stream;
 use serde::{de::DeserializeOwned, Deserialize};
 
 use crate::{
-    constants,
+    constants, driver_bridge,
     models::{CosmosDiagnostics, CosmosResponse},
     SessionToken,
 };
@@ -238,9 +240,7 @@ pub(crate) struct FeedBody<T> {
 }
 
 impl<T: DeserializeOwned> QueryFeedPage<T> {
-    pub(crate) async fn from_response(
-        response: CosmosResponse<FeedBody<T>>,
-    ) -> azure_core::Result<Self> {
+    pub(crate) fn from_response(response: CosmosResponse<FeedBody<T>>) -> azure_core::Result<Self> {
         let raw_headers = response.headers().clone();
         let continuation = raw_headers.get_optional_string(&constants::CONTINUATION);
         let cosmos_headers = response.cosmos_headers().clone();
@@ -263,6 +263,57 @@ impl<T: DeserializeOwned> QueryFeedPage<T> {
     }
 }
 
+fn create_pagination_stream<T: DeserializeOwned + Send + 'static>(
+    driver: Arc<CosmosDriver>,
+    container: Option<ContainerReference>,
+    plan: OperationPlan,
+    options: OperationOptions,
+) -> BoxStream<'static, azure_core::Result<QueryFeedPage<T>>> {
+    struct State {
+        driver: Arc<CosmosDriver>,
+        container: Option<ContainerReference>,
+        plan: OperationPlan,
+        options: OperationOptions,
+        continuation: Option<String>,
+    }
+    let initial_state = State {
+        driver,
+        container,
+        options,
+        plan,
+        continuation: None,
+    };
+    let stream = futures::stream::unfold(Some(initial_state), |state| async move {
+        let Some(mut state) = state else {
+            return None; // No more pages to fetch
+        };
+
+        let result = state
+            .driver
+            .execute_plan(
+                &mut state.plan,
+                state.container.clone(),
+                state.options.clone(),
+            )
+            .await;
+        let driver_response = match result {
+            Ok(None) => return None, // No more pages to fetch
+            Err(err) => return Some((Err(err), None)), // Propagate the error; the stream ends after yielding it.
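+            // (Returning `None` as the successor state makes `unfold` end the
+            // stream on the next poll, so an error is always terminal.)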
+            Ok(Some(r)) => r,
+        };
+
+        // Parse the response into a page
+        let response =
+            driver_bridge::driver_response_to_cosmos_response::<FeedBody<T>>(driver_response);
+        let page = match QueryFeedPage::from_response(response) {
+            Ok(page) => page,
+            Err(err) => return Some((Err(err), None)), // Propagate the error; the stream ends after yielding it.
+        };
+        Some((Ok(page), Some(state)))
+    });
+    Box::pin(stream)
+}
+
 /// Represents a stream of items from a Cosmos DB query.
 ///
 /// See [`QueryFeedPage`] for more details on Cosmos DB feeds.
@@ -273,15 +324,24 @@ pub struct FeedItemIterator<T> {
     current: Option>,
 }
 
-impl FeedItemIterator {
+impl FeedItemIterator {
-    /// Creates a new `FeedItemIterator` from a stream of pages.
+    /// Creates a new `FeedItemIterator` that pages through the given operation plan.
-    pub(crate) fn new(plan: OperationPlan) -> Self {
+    pub(crate) fn new(
+        driver: Arc<CosmosDriver>,
+        container: Option<ContainerReference>,
+        plan: OperationPlan,
+        options: OperationOptions,
+    ) -> Self {
         Self {
-            pages: Box::pin(stream),
+            pages: create_pagination_stream(driver, container, plan, options),
             current: None,
         }
     }
 
+    /// Converts this item iterator into a page iterator, yielding full pages instead of individual items.
+    ///
+    /// IMPORTANT: This will DISCARD any items from the current page that have not yet been yielded by the item iterator.
+    /// Use this method before consuming any items if you want to switch to page-based iteration.
     pub fn into_pages(self) -> FeedPageIterator<T> {
         FeedPageIterator(self.pages)
     }
@@ -322,12 +382,6 @@ impl Stream for FeedItemIterator {
 
 pub struct FeedPageIterator<T>(BoxStream<'static, azure_core::Result<QueryFeedPage<T>>>);
 
-impl<T> FeedPageIterator<T> {
-    pub fn new(driver: Arc<CosmosDriver>, options: OperationOptions, plan: OperationPlan) -> Self {
-        driver.execute_operation()
-    }
-}
-
 impl<T> Stream for FeedPageIterator<T> {
     type Item = azure_core::Result<QueryFeedPage<T>>;
 
@@ -367,7 +421,10 @@ mod tests {
         ];
         let stream = futures::stream::iter(pages);
 
-        let item_iter = FeedItemIterator::new(stream);
+        let item_iter = FeedItemIterator {
+            pages: Box::pin(stream),
+            current: None,
+        };
 
         let items: Vec<_> = item_iter
             .collect::<Vec<_>>()
             .await
@@ -386,7 +443,11 @@ mod tests {
         ];
         let stream = futures::stream::iter(pages);
 
-        let page_iter = FeedItemIterator::new(stream).into_pages();
+        let page_iter = FeedItemIterator {
+            pages: Box::pin(stream),
+            current: None,
+        }
+        .into_pages();
 
         let page_items: Vec<_> = page_iter
             .collect::<Vec<_>>()
             .await
@@ -408,7 +469,10 @@ mod tests {
         ];
         let stream = futures::stream::iter(pages);
 
-        let mut item_iter = FeedItemIterator::new(stream);
+        let mut item_iter = FeedItemIterator {
+            pages: Box::pin(stream),
+            current: None,
+        };
 
         // First two items should succeed
         assert_eq!(item_iter.next().await.unwrap().unwrap(), 1);
@@ -427,7 +491,10 @@ mod tests {
         ];
         let stream = futures::stream::iter(pages);
 
-        let item_iter = FeedItemIterator::new(stream);
+        let item_iter = FeedItemIterator {
+            pages: Box::pin(stream),
+            current: None,
+        };
 
         let items: Vec<_> = item_iter
             .collect::<Vec<_>>()
diff --git a/sdk/cosmos/azure_data_cosmos/src/models/cosmos_response.rs b/sdk/cosmos/azure_data_cosmos/src/models/cosmos_response.rs
index d36e35e0ab3..716fe204f53 100644
--- a/sdk/cosmos/azure_data_cosmos/src/models/cosmos_response.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/models/cosmos_response.rs
@@ -395,10 +395,7 @@ mod tests {
         let typed_response: Response> = raw_response.into();
         let cosmos_response = CosmosResponse::new(typed_response, create_mock_request());
 
-        let rt = tokio::runtime::Runtime::new().unwrap();
-        let page = rt
-            .block_on(QueryFeedPage::from_response(cosmos_response))
-            .unwrap();
+        let page = QueryFeedPage::from_response(cosmos_response).unwrap();
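+        // (No Tokio runtime is needed anymore: `from_response` is synchronous.)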
assert_eq!( page.index_metrics(), Some(r#"{"UtilizedSingleIndexes":[]}"#) diff --git a/sdk/cosmos/azure_data_cosmos/src/query/mod.rs b/sdk/cosmos/azure_data_cosmos/src/query.rs similarity index 99% rename from sdk/cosmos/azure_data_cosmos/src/query/mod.rs rename to sdk/cosmos/azure_data_cosmos/src/query.rs index 44befb366fa..9cf04d30d17 100644 --- a/sdk/cosmos/azure_data_cosmos/src/query/mod.rs +++ b/sdk/cosmos/azure_data_cosmos/src/query.rs @@ -5,10 +5,6 @@ use serde::Serialize; -pub(crate) mod executor; - -pub use executor::QueryExecutor; - /// Represents a Cosmos DB Query, with optional parameters. /// /// # Examples diff --git a/sdk/cosmos/azure_data_cosmos/src/query/executor.rs b/sdk/cosmos/azure_data_cosmos/src/query/executor.rs deleted file mode 100644 index 11d79fa4c24..00000000000 --- a/sdk/cosmos/azure_data_cosmos/src/query/executor.rs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -//! Query execution implementation. - -use std::{collections::HashMap, sync::Arc}; - -use azure_core::http::headers::{HeaderName, HeaderValue}; -use azure_data_cosmos_driver::{ - models::{CosmosOperation, SessionToken}, - options::OperationOptions as DriverOperationOptions, - CosmosDriver, -}; -use serde::de::DeserializeOwned; - -use crate::{constants, driver_bridge, feed::FeedBody, Query, QueryFeedPage}; - -/// A query executor that sends queries through the Cosmos driver. -/// -/// This executor handles pagination via continuation tokens and works for -/// item queries (with partition key), database queries, and container queries. -/// The `operation_factory` closure produces the appropriate `CosmosOperation` -/// for each page request. -pub struct QueryExecutor { - driver: Arc, - operation_factory: Box CosmosOperation + Send>, - query: Query, - query_body: Option>, - base_options: DriverOperationOptions, - base_headers: HashMap, - session_token: Option, - continuation: Option, - complete: bool, - // Why is our phantom type a function? Because that represents how we _use_ the type T. - // Normally, PhantomData is only Send/Sync if T is, because PhantomData is indicating that while we don't _name_ T in a field, we should act as though we have a field of type T. - // However, we don't store any T values in this, we only RETURN them. - // That means we use a function pointer to indicate that we don't actually operate on T directly, we just return it. - // Because of this, PhantomData T> is Send/Sync even if T isn't (see https://doc.rust-lang.org/stable/nomicon/phantom-data.html#table-of-phantomdata-patterns) - phantom: std::marker::PhantomData T>, -} - -impl QueryExecutor { - pub(crate) fn new( - driver: Arc, - operation_factory: impl Fn() -> CosmosOperation + Send + 'static, - query: Query, - base_options: DriverOperationOptions, - session_token: Option, - ) -> Self { - // Pre-build the static headers that are the same for every page: - // user-provided custom headers + query-specific constants. 
- let mut base_headers = base_options.custom_headers().cloned().unwrap_or_default(); - base_headers.insert(constants::QUERY.clone(), HeaderValue::from_static("True")); - base_headers.insert( - azure_core::http::headers::CONTENT_TYPE, - HeaderValue::from_static("application/query+json"), - ); - - Self { - driver, - operation_factory: Box::new(operation_factory), - query, - query_body: None, - base_options, - base_headers, - session_token, - continuation: None, - complete: false, - phantom: std::marker::PhantomData, - } - } - - /// Consumes the executor and converts it into a stream of pages. - pub fn into_stream(self) -> azure_core::Result> { - Ok(crate::FeedItemIterator::new(futures::stream::try_unfold( - self, - |mut state| async move { - let val = state.next_page().await?; - Ok(val.map(|item| (item, state))) - }, - ))) - } - - /// Fetches the next page of query results. - /// - /// Returns `None` if there are no more pages to fetch. - pub async fn next_page(&mut self) -> azure_core::Result>> { - if self.complete { - return Ok(None); - } - - // Build a fresh operation for this page - let mut operation = (self.operation_factory)(); - - // Serialize the query body on the first page and cache it for subsequent pages. - if self.query_body.is_none() { - self.query_body = Some(serde_json::to_vec(&self.query)?); - } - operation = operation.with_body(self.query_body.clone().unwrap()); - - // The explicit session token serves as an initial hint; the driver's - // internal session manager captures response tokens and applies them - // to subsequent requests automatically. - if let Some(session_token) = &self.session_token { - operation = operation.with_session_token(session_token.clone()); - } - - // Clone the pre-built static headers and add the continuation token - // (the only header that changes between pages). - let mut headers = self.base_headers.clone(); - if let Some(continuation) = &self.continuation { - headers.insert( - constants::CONTINUATION.clone(), - HeaderValue::from(continuation.clone()), - ); - } - - let op_options = self.base_options.clone().with_custom_headers(headers); - - // Execute through the driver - let driver_response = self - .driver - .execute_operation(operation, op_options, None) - .await? 
- .ok_or_else(|| { - azure_core::Error::with_message( - azure_core::error::ErrorKind::Other, - "query operation completed without producing a response", - ) - })?; - - // Bridge driver response to SDK types - let cosmos_response = - driver_bridge::driver_response_to_cosmos_response::>(driver_response); - - let page = QueryFeedPage::::from_response(cosmos_response).await?; - - match page.continuation() { - Some(token) => self.continuation = Some(token.to_string()), - None => self.complete = true, - } - - Ok(Some(page)) - } -} diff --git a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_containers.rs b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_containers.rs index cfebc801a37..367a692bc12 100644 --- a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_containers.rs +++ b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_containers.rs @@ -72,11 +72,13 @@ pub async fn container_crud_simple() -> Result<(), Box> { indexing_policy.indexing_mode.unwrap() ); - let mut query_pager = db_client.query_containers( - Query::from("SELECT * FROM root r WHERE r.id = @id") - .with_parameter("@id", &properties.id)?, - None, - )?; + let mut query_pager = db_client + .query_containers( + Query::from("SELECT * FROM root r WHERE r.id = @id") + .with_parameter("@id", &properties.id)?, + None, + ) + .await?; let mut ids = vec![]; while let Some(db) = query_pager.try_next().await? { ids.push(db.id); @@ -120,11 +122,13 @@ pub async fn container_crud_simple() -> Result<(), Box> { container_client.delete(None).await?; - query_pager = db_client.query_containers( - Query::from("SELECT * FROM root r WHERE r.id = @id") - .with_parameter("@id", &properties.id)?, - None, - )?; + query_pager = db_client + .query_containers( + Query::from("SELECT * FROM root r WHERE r.id = @id") + .with_parameter("@id", &properties.id)?, + None, + ) + .await?; let mut ids = vec![]; while let Some(db) = query_pager.try_next().await? { ids.push(db.id); diff --git a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_databases.rs b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_databases.rs index 2a30d13bed6..43bc5786bb2 100644 --- a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_databases.rs +++ b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_databases.rs @@ -37,7 +37,7 @@ pub async fn database_crud() -> Result<(), Box> { let query = Query::from("SELECT * FROM root r WHERE r.id = @id") .with_parameter("@id", &test_db_id)?; - let mut pager = cosmos_client.query_databases(query.clone(), None)?; + let mut pager = cosmos_client.query_databases(query.clone(), None).await?; let mut ids = Vec::new(); while let Some(db) = pager.try_next().await? { ids.push(db.id); @@ -50,7 +50,7 @@ pub async fn database_crud() -> Result<(), Box> { // We're testing delete, so we want to manually delete the DB rather than letting the clean-up process do it. db_client.delete(None).await?; - let mut pager = cosmos_client.query_databases(query, None)?; + let mut pager = cosmos_client.query_databases(query, None).await?; let mut ids = Vec::new(); while let Some(db) = pager.try_next().await? 
{ ids.push(db.id); diff --git a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs index 91d2e2d2c3e..12cc7f98149 100644 --- a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs +++ b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs @@ -11,12 +11,14 @@ use std::error::Error; use azure_core::http::headers::HeaderValue; use azure_core::http::StatusCode; use azure_data_cosmos::{ + clients::DatabaseClient, constants, options::{OperationOptions, QueryOptions}, - Query, + PartitionKey, Query, }; use framework::{test_data, MockItem, TestClient}; use futures::{StreamExt, TryStreamExt}; +use serde::de::DeserializeOwned; fn collect_matching_items( items: &[MockItem], @@ -25,6 +27,56 @@ fn collect_matching_items( items.iter().filter(|p| predicate(p)).cloned().collect() } +#[derive(Default)] +struct QueryTestOptions { + max_item_count: Option, + use_continuation_token_resume: bool, +} + +async fn execute_query_test( + db_client: &DatabaseClient, + items: Vec, + query: impl Into, + partition_key: impl Into, + expected_items: Vec, + options: QueryTestOptions, +) -> Result<(), Box> +where + T: DeserializeOwned + Send + Eq + std::fmt::Debug + 'static, +{ + let container_client = test_data::create_container_with_items(db_client, items, None).await?; + + let mut query_options = QueryOptions::default(); + if let Some(max_item_count) = options.max_item_count { + query_options = query_options.with_max_item_count(max_item_count); + } + + let mut pages = container_client + .query_items::(query, partition_key, Some(query_options)) + .await? + .into_pages(); + + let mut actual_items = Vec::new(); + while let Some(page) = pages.next().await { + actual_items.extend(page?.into_items()); + } + + if options.use_continuation_token_resume { + // Placeholder for future continuation token-based resume support. 
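(Aside: `use_continuation_token_resume` is only a placeholder at this point. For illustration, the resume path inside this helper might eventually look like the sketch below; `with_continuation` is an assumed, not-yet-existing option standing in for whatever mechanism ends up seeding a resumed query, so the real API may differ:)

```rust
// Hypothetical resume flow (not the current API): drain one page, capture its
// continuation token, then build a new pager seeded with that token.
let mut pages = container_client
    .query_items::<T>(query.clone(), partition_key.clone(), None)
    .await?
    .into_pages();

if let Some(page) = pages.next().await.transpose()? {
    if let Some(token) = page.continuation() {
        // ASSUMPTION: `with_continuation` does not exist yet; it stands in for
        // whatever mechanism eventually seeds a resumed query.
        let resumed_options = QueryOptions::default().with_continuation(token.to_string());
        let resumed = container_client
            .query_items::<T>(query, partition_key, Some(resumed_options))
            .await?;
        // `resumed` should yield only the items that follow `page`.
        let _ = resumed;
    }
}
```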
+ panic!("Continuation token resume support not yet implemented"); + } + + assert_eq!(expected_items, actual_items); + Ok(()) +} + +#[derive(serde::Deserialize, Debug, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +struct ItemProjection { + id: String, + merge_order: usize, +} + #[tokio::test] #[cfg_attr( not(test_category = "emulator"), @@ -32,18 +84,20 @@ fn collect_matching_items( )] pub async fn single_partition_query_simple() -> Result<(), Box> { TestClient::run_with_unique_db( - async |run_context, db_client| { + async |_, db_client| { let items = test_data::generate_mock_items(10, 10); - let container_client = - test_data::create_container_with_items(db_client, items.clone(), None).await?; + let expected_items = + collect_matching_items(&items, |p| p.partition_key == "partition0"); - let result_items: Vec = run_context - .query_items(&container_client, "select * from docs c", "partition0") - .await?; - assert_eq!( - collect_matching_items(&items, |p| p.partition_key == "partition0"), - result_items - ); + execute_query_test( + db_client, + items, + "select * from docs c", + "partition0", + expected_items, + QueryTestOptions::default(), + ) + .await?; Ok(()) }, @@ -59,10 +113,8 @@ pub async fn single_partition_query_simple() -> Result<(), Box> { )] pub async fn single_partition_query_with_parameters() -> Result<(), Box> { TestClient::run_with_unique_db( - async |run_context, db_client| { + async |_, db_client| { let items = test_data::generate_mock_items(10, 10); - let container_client = - test_data::create_container_with_items(db_client, items.clone(), None).await?; // Find a merge order value in partition1's items let merge_order = items @@ -74,13 +126,17 @@ pub async fn single_partition_query_with_parameters() -> Result<(), Box = run_context - .query_items(&container_client, query, "partition1") - .await?; - assert_eq!( - collect_matching_items(&items, |p| p.merge_order == merge_order), - result_items - ); + let expected_items = collect_matching_items(&items, |p| p.merge_order == merge_order); + + execute_query_test( + db_client, + items, + query, + "partition1", + expected_items, + QueryTestOptions::default(), + ) + .await?; Ok(()) }, @@ -96,22 +152,26 @@ pub async fn single_partition_query_with_parameters() -> Result<(), Box Result<(), Box> { TestClient::run_with_unique_db( - async |run_context, db_client| { + async |_, db_client| { let items = test_data::generate_mock_items(10, 10); - let container_client = - test_data::create_container_with_items(db_client, items.clone(), None).await?; - - let result_items: Vec = run_context - .query_items(&container_client, "select value c.id from c", "partition1") - .await?; - assert_eq!( - items - .iter() - .filter(|p| p.partition_key == "partition1") - .map(|p| p.id.to_string()) - .collect::>(), - result_items - ); + let expected_items = items + .iter() + .filter(|p| p.partition_key == "partition1") + .map(|p| ItemProjection { + id: p.id.to_string(), + merge_order: p.merge_order, + }) + .collect::>(); + + execute_query_test( + db_client, + items, + "select c.id, c.mergeOrder from c", + "partition1", + expected_items, + QueryTestOptions::default(), + ) + .await?; Ok(()) }, @@ -127,27 +187,23 @@ pub async fn single_partition_query_with_projection() -> Result<(), Box Result<(), Box> { TestClient::run_with_unique_db( - async |run_context, db_client| { + async |_, db_client| { let items = test_data::generate_mock_items(10, 2); - let container_client = - test_data::create_container_with_items(db_client, items.clone(), None).await?; - - let 
result_items: Vec = run_context - .query_items( - &container_client, - "select value c.id from c where c.mergeOrder between 40 and 60", - (), - ) - .await?; - - assert_eq!( - items - .iter() - .filter(|p| p.merge_order >= 40 && p.merge_order <= 60) - .map(|p| p.id.to_string()) - .collect::>(), - result_items - ); + let expected_items = items + .iter() + .filter(|p| p.merge_order >= 40 && p.merge_order <= 60) + .map(|p| p.id.to_string()) + .collect::>(); + + execute_query_test( + db_client, + items, + "select value c.id from c where c.mergeOrder between 40 and 60", + (), + expected_items, + QueryTestOptions::default(), + ) + .await?; Ok(()) }, @@ -161,19 +217,16 @@ pub async fn cross_partition_query_with_projection_and_filter() -> Result<(), Bo not(test_category = "emulator"), ignore = "requires test_category 'emulator'" )] -pub async fn cross_partition_query_with_order_by_fails_without_query_engine( -) -> Result<(), Box> { +pub async fn cross_partition_query_with_order_by_fails() -> Result<(), Box> { TestClient::run_with_unique_db( async |_, db_client| { let items = test_data::generate_mock_items(10, 10); let container_client = test_data::create_container_with_items(db_client, items.clone(), None).await?; - let mut pager = container_client.query_items::( - "select value c.id from c order by c.mergeOrder", - (), - None, - )?; + let mut pager = container_client + .query_items::("select value c.id from c order by c.mergeOrder", (), None) + .await?; let result = pager.try_next().await; let Err(err) = result else { @@ -226,7 +279,7 @@ pub async fn query_returns_index_and_query_metrics() -> Result<(), Box("select * from c", "partition0", Some(options))? + .query_items::("select * from c", "partition0", Some(options)).await? .into_pages(); // Get the first page and check metrics headers @@ -289,9 +342,6 @@ pub async fn single_partition_query_pagination() -> Result<(), Box> { TestClient::run_with_unique_db( async |_, db_client| { let items = test_data::generate_mock_items(1, 5); - let container_client = - test_data::create_container_with_items(db_client, items.clone(), None).await?; - let expected_items = collect_matching_items(&items, |p| p.partition_key == "partition0"); assert!( @@ -299,37 +349,18 @@ pub async fn single_partition_query_pagination() -> Result<(), Box> { "need multiple items to test pagination" ); - // Force 1 item per page to exercise continuation token pagination - let mut custom_headers = HashMap::new(); - custom_headers.insert(constants::MAX_ITEM_COUNT, HeaderValue::from_static("1")); - let operation = OperationOptions::default().with_custom_headers(custom_headers); - let options = QueryOptions::default().with_operation_options(operation); - - let mut pages = container_client - .query_items::("select * from c", "partition0", Some(options))? 
- .into_pages(); - - let mut all_items = Vec::new(); - let mut page_count = 0; - - while let Some(page) = pages.next().await { - let page = page?; - assert!( - page.items().len() <= 1, - "expected at most 1 item per page, got {}", - page.items().len() - ); - all_items.extend(page.into_items()); - page_count += 1; - } - - assert!( - page_count >= expected_items.len(), - "expected at least {} pages with max-item-count=1, got {}", - expected_items.len(), - page_count - ); - assert_eq!(expected_items, all_items); + execute_query_test( + db_client, + items, + "select * from c", + "partition0", + expected_items, + QueryTestOptions { + max_item_count: Some(1), + use_continuation_token_resume: false, + }, + ) + .await?; Ok(()) }, @@ -347,44 +378,19 @@ pub async fn cross_partition_query_pagination() -> Result<(), Box> { TestClient::run_with_unique_db( async |_, db_client| { let items = test_data::generate_mock_items(3, 3); - let container_client = - test_data::create_container_with_items(db_client, items.clone(), None).await?; - - // Force 1 item per page for cross-partition query - let mut custom_headers = HashMap::new(); - custom_headers.insert(constants::MAX_ITEM_COUNT, HeaderValue::from_static("1")); - let operation = OperationOptions::default().with_custom_headers(custom_headers); - let options = QueryOptions::default().with_operation_options(operation); - - let mut pages = container_client - .query_items::("select * from c", (), Some(options))? - .into_pages(); - - let mut all_items = Vec::new(); - let mut page_count = 0; - - while let Some(page) = pages.next().await { - let page = page?; - assert!( - page.items().len() <= 1, - "expected at most 1 item per page, got {}", - page.items().len() - ); - all_items.extend(page.into_items()); - page_count += 1; - } - assert!( - page_count > 1, - "expected multiple pages with max-item-count=1, got {}", - page_count - ); - // Cross-partition ordering is not guaranteed, so just check count - assert_eq!( - items.len(), - all_items.len(), - "expected all items to be returned across pages" - ); + execute_query_test( + db_client, + items.clone(), + "select * from c", + (), + items, + QueryTestOptions { + max_item_count: Some(1), + use_continuation_token_resume: false, + }, + ) + .await?; Ok(()) }, diff --git a/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs b/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs index ff193a2eb0d..7fdaf242f4b 100644 --- a/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs +++ b/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs @@ -639,7 +639,10 @@ impl TestRunContext { const MAX_BACKOFF: Duration = Duration::from_secs(10); loop { - match container.query_items::(query.clone(), partition_key.clone(), None) { + match container + .query_items::(query.clone(), partition_key.clone(), None) + .await + { Ok(pager) => match pager.try_collect::>().await { Ok(items) => return Ok(items), Err(e) if e.http_status() == Some(StatusCode::NotFound) => { @@ -854,7 +857,7 @@ impl TestRunContext { "SELECT * FROM root r WHERE r.id LIKE 'auto-test-{}'", self.run_id )); - let mut pager = self.client().query_databases(query, None)?; + let mut pager = self.client().query_databases(query, None).await?; let mut ids = Vec::new(); while let Some(db) = pager.try_next().await? 
{ ids.push(db.id); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs index 29c0ae9ae5c..d3e16ab06f0 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs @@ -1097,7 +1097,7 @@ impl CosmosDriver { ), )); } - tracing::debug!("operation started"); + tracing::debug!("plan execution started"); let mut executor = DriverRequestExecutor { driver: self, @@ -1124,6 +1124,14 @@ impl CosmosDriver { overrides: OperationOverrides, options: &OperationOptions, ) -> azure_core::Result { + tracing::debug!( + operation_type = ?operation.operation_type(), + resource_type = ?operation.resource_type(), + resource_reference = ?operation.resource_reference(), + overrides = ?overrides, + body_length = operation.body().map(|b| b.len()), + "executing operation"); + // Step 1: Build the single OperationOptionsView for layered resolution. let effective_options = self.operation_options_view(options); @@ -1342,9 +1350,11 @@ impl CosmosDriver { )); } + tracing::debug!(operation_type = ?operation.operation_type(), resource_type = ?operation.resource_type(), resource_reference = ?operation.resource_reference(), "planning operation"); + // Trivial plan: anything that isn't a cross-partition query. if operation.is_trivial() { - let pipeline = planner::build_trivial_pipeline(&operation)?; + let pipeline = planner::build_trivial_pipeline(operation)?; return Ok(OperationPlan::new(pipeline)); } @@ -1382,7 +1392,7 @@ impl CosmosDriver { |container, continuation| self.fetch_partition_key_ranges(container, continuation), ); - let pipeline = planner::build_sequential_drain(&query_plan, &mut topology, &operation).await?; + let pipeline = planner::build_sequential_drain(&query_plan, &mut topology, operation).await?; Ok(OperationPlan::new(pipeline)) } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs index eeefd749bca..529c6948c06 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs @@ -49,7 +49,7 @@ pub(crate) fn build_trivial_pipeline(operation: &CosmosOperation) -> azure_core: "operation target {target_desc} is not valid for resource type {resource_type}", target_desc = target_description(target), ), - )); + ))?; } let request_target = match target { @@ -64,7 +64,7 @@ pub(crate) fn build_trivial_pipeline(operation: &CosmosOperation) -> azure_core: } }; - let root = Request::new(operation.clone(), request_target); + let root = Request::new(operation.clone(), request_target, None); Ok(Pipeline::new(Box::new(root))) } @@ -104,7 +104,7 @@ pub(crate) async fn build_sequential_drain( range: resolved_range.range, partition_key_range_id: resolved_range.partition_key_range_id, }; - request_nodes.push(Box::new(Request::new(operation.clone(), target))); + request_nodes.push(Box::new(Request::new(operation.clone(), target, None))); } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs index a5433cce133..937cbf540d3 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs @@ -40,31 +40,43 @@ impl RequestTarget { } } +#[derive(Debug, PartialEq, Eq)] +enum RequestState { + /// 
No request has been sent yet. The next page will trigger the initial request. + Initial, + + /// A request has been sent and a server continuation token has been received, but not all pages have been drained yet. The next page will trigger a request with the continuation token. + Continuing { continuation: String }, + + /// All pages have been drained. No further requests will be sent. + Drained, +} + /// Leaf node that executes one Cosmos DB request per page. pub(crate) struct Request { operation: CosmosOperation, target: RequestTarget, - latest_server_continuation: Option, - logical_partition_topology_retry_used: bool, + state: RequestState, } impl Request { /// Creates a request node. - pub(crate) fn new(operation: CosmosOperation, target: RequestTarget) -> Self { - Self::with_continuation(operation, target, None) - } - - /// Creates a request node restored with the latest server-issued continuation. - pub(crate) fn with_continuation( + pub(crate) fn new( operation: CosmosOperation, target: RequestTarget, - latest_server_continuation: Option, + initial_continuation: Option, ) -> Self { + let initial_state = if let Some(token) = initial_continuation { + RequestState::Continuing { + continuation: token, + } + } else { + RequestState::Initial + }; Self { operation, target, - latest_server_continuation, - logical_partition_topology_retry_used: false, + state: initial_state, } } @@ -77,11 +89,6 @@ impl Request { pub(crate) fn target(&self) -> &RequestTarget { &self.target } - - /// Returns the latest server-issued continuation for this request's partition. - pub(crate) fn latest_server_continuation(&self) -> Option<&str> { - self.latest_server_continuation.as_deref() - } } #[async_trait] @@ -90,20 +97,31 @@ impl PipelineNode for Request { &mut self, context: &mut PipelineContext<'_>, ) -> azure_core::Result { + tracing::trace!( + target = ?self.target, + state = ?self.state, + "executing request node" + ); + + let continuation = match &self.state { + RequestState::Initial => None, + RequestState::Continuing { continuation } => Some(continuation.clone()), + RequestState::Drained => return Ok(PageResult::Drained), + }; + match context .execute_request( &self.operation, self.target.clone(), PartitionRoutingRefresh::UseCached, - self.latest_server_continuation.clone(), + continuation.clone(), ) .await { - Ok(response) => Ok(PageResult::Page( - self.record_response_continuation(response), - )), + Ok(response) => Ok(self.handle_response(response)), Err(error) if is_partition_topology_change(&error) => { - self.handle_partition_topology_change(context, error).await + self.handle_partition_topology_change(context, error, continuation) + .await } Err(error) => Err(error), } @@ -118,10 +136,30 @@ impl PipelineNode for Request { } } impl Request { + fn handle_response(&mut self, response: CosmosResponse) -> PageResult { + let continuation = response.headers().continuation.clone(); + tracing::trace!( + target = ?self.target, + status = ?response.status(), + output_continuation = ?continuation, + "request completed" + ); + self.state = if let Some(token) = continuation { + RequestState::Continuing { + continuation: token, + } + } else { + RequestState::Drained + }; + tracing::trace!(target = ?self.target, state = ?self.state, "updated request state after response"); + PageResult::Page(response) + } + async fn handle_partition_topology_change( &mut self, context: &mut PipelineContext<'_>, error: azure_core::Error, + continuation: Option, ) -> azure_core::Result { match &self.target { 
RequestTarget::NonPartitioned => { @@ -129,25 +167,27 @@ impl Request { Err(error) } RequestTarget::LogicalPartitionKey(_) => { - if self.logical_partition_topology_retry_used { - return Err(error); - } - // This shouldn't really happen, but it's been observed. // Since the original request had a logical partition key, // the gateway should have been able to route the request // to the correct partition even if it has split. // But we can do a single retry without forcing a topology refresh to see if it succeeds. - self.logical_partition_topology_retry_used = true; context .execute_request( &self.operation, self.target.clone(), PartitionRoutingRefresh::ForceRefresh, - self.latest_server_continuation.clone(), + continuation, ) .await - .map(|response| PageResult::Page(self.record_response_continuation(response))) + .map(|response| { + tracing::trace!( + target = ?self.target, + status = ?response.status(), + "retry after logical partition key topology change succeeded" + ); + self.handle_response(response) + }) } RequestTarget::EffectivePartitionKeyRange { range, .. } => { let range = range.clone(); @@ -178,26 +218,22 @@ impl Request { // covers the same starting EPK. For a split, only the left-most child // inherits the continuation since it resumes where this node left off. // TODO: When we support streaming ordered merges, we'll need to augment this a bit. - let continuation = if target.covers_start_of(range) { - self.latest_server_continuation.clone() - } else { - None + let continuation = match (target.covers_start_of(range), &self.state) { + ( + true, + RequestState::Continuing { + continuation: latest_server_continuation, + }, + ) => Some(latest_server_continuation.clone()), + _ => None, }; - Box::new(Request::with_continuation( - self.operation.clone(), - target, - continuation, - )) as Box + Box::new(Request::new(self.operation.clone(), target, continuation)) + as Box }) .collect(); Ok(PageResult::SplitRequired { replacement_nodes }) } - - fn record_response_continuation(&mut self, response: CosmosResponse) -> CosmosResponse { - self.latest_server_continuation = response.headers().continuation.clone(); - response - } } // Partition topology changes are a specific subset of `Gone` substatus codes. 
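(Aside: the split-recovery rule implemented above, where only the child that covers the start of the parent's EPK range inherits the server continuation and every other child starts fresh, can be isolated into a small sketch. `Range` and `child_continuations` here are simplified stand-ins for `FeedRange` and the node-replacement logic, using plain string bounds:)

```rust
#[derive(Debug, PartialEq)]
struct Range {
    min: String,
    max: String,
}

impl Range {
    /// True when `self` begins exactly where `parent` begins.
    fn covers_start_of(&self, parent: &Range) -> bool {
        self.min == parent.min
    }
}

/// For each child produced by a split, decide which continuation (if any)
/// it should resume from.
fn child_continuations(
    parent: &Range,
    parent_continuation: Option<&str>,
    children: &[Range],
) -> Vec<Option<String>> {
    children
        .iter()
        .map(|child| {
            if child.covers_start_of(parent) {
                // The left-most child resumes where the parent left off.
                parent_continuation.map(str::to_owned)
            } else {
                // Every other child starts from scratch.
                None
            }
        })
        .collect()
}

fn main() {
    let parent = Range { min: "00".into(), max: "FF".into() };
    let children = [
        Range { min: "00".into(), max: "80".into() },
        Range { min: "80".into(), max: "FF".into() },
    ];
    // The children tile the parent range exactly.
    assert_eq!(children.last().unwrap().max, parent.max);
    assert_eq!(
        child_continuations(&parent, Some("token"), &children),
        vec![Some("token".to_string()), None]
    );
}
```

This mirrors the tests below: after a two-way split, the left child resumes from the server token while the right child starts in `Initial`.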
@@ -233,7 +269,7 @@ mod tests { #[tokio::test] async fn request_retries_logical_partition_key_topology_change_once() { - let mut request = Request::new(operation(), logical_partition_target()); + let mut request = Request::new(operation(), logical_partition_target(), None); let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Ok(response(b"ok"))]); let mut topology = NoopTopologyProvider; let mut context = PipelineContext::new(&mut executor, &mut topology); @@ -253,7 +289,7 @@ mod tests { #[tokio::test] async fn request_returns_second_logical_partition_key_topology_change() { - let mut request = Request::new(operation(), logical_partition_target()); + let mut request = Request::new(operation(), logical_partition_target(), None); let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Err(gone_error())]); let mut topology = NoopTopologyProvider; let mut context = PipelineContext::new(&mut executor, &mut topology); @@ -273,7 +309,7 @@ mod tests { #[tokio::test] async fn request_does_not_retry_non_topology_gone() { - let mut request = Request::new(operation(), logical_partition_target()); + let mut request = Request::new(operation(), logical_partition_target(), None); let mut executor = MockRequestExecutor::new(vec![Err(non_topology_gone_error())]); let mut topology = NoopTopologyProvider; let mut context = PipelineContext::new(&mut executor, &mut topology); @@ -290,7 +326,7 @@ mod tests { #[tokio::test] async fn request_tracks_server_continuation_for_next_page() { - let mut request = Request::new(operation(), logical_partition_target()); + let mut request = Request::new(operation(), logical_partition_target(), None); let mut executor = MockRequestExecutor::new(vec![ Ok(response_with_continuation(b"page1", Some("token-1"))), Ok(response_with_continuation(b"page2", Some("token-2"))), @@ -307,12 +343,17 @@ mod tests { executor.continuation_calls, vec![None, Some("token-1".to_string())] ); - assert_eq!(request.latest_server_continuation(), Some("token-2")); + assert_eq!( + request.state, + RequestState::Continuing { + continuation: "token-2".to_string() + } + ); } #[tokio::test] async fn request_uses_restored_continuation_on_first_page() { - let mut request = Request::with_continuation( + let mut request = Request::new( operation(), logical_partition_target(), Some("restored-token".to_string()), @@ -328,14 +369,14 @@ mod tests { executor.continuation_calls, vec![Some("restored-token".to_string())] ); - assert_eq!(request.latest_server_continuation(), None); + assert_eq!(request.state, RequestState::Drained); } // ── Split recovery tests ────────────────────────────────────────────── #[tokio::test] async fn epk_range_topology_change_returns_split_required() { - let mut request = Request::new(operation(), epk_range_target()); + let mut request = Request::new(operation(), epk_range_target(), None); let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); let mut topology = MockTopologyProvider::new(vec![Ok(vec![ ResolvedRange { @@ -390,7 +431,7 @@ mod tests { #[tokio::test] async fn split_left_child_inherits_continuation() { - let mut request = Request::with_continuation( + let mut request = Request::new( operation(), epk_range_target(), Some("server-token".to_string()), @@ -419,15 +460,17 @@ mod tests { PageResult::SplitRequired { replacement_nodes } => { let left = replacement_nodes[0].downcast_ref::().unwrap(); assert_eq!( - left.latest_server_continuation(), - Some("server-token"), + left.state, + RequestState::Continuing { + continuation: 
"server-token".to_string() + }, "left-most child should inherit the server continuation" ); let right = replacement_nodes[1].downcast_ref::().unwrap(); assert_eq!( - right.latest_server_continuation(), - None, + right.state, + RequestState::Initial, "non-left children should have no continuation" ); } @@ -441,7 +484,7 @@ mod tests { EffectivePartitionKey::from("10"), EffectivePartitionKey::from("90"), ); - let mut request = Request::with_continuation( + let mut request = Request::new( operation(), RequestTarget::EffectivePartitionKeyRange { range: range.clone(), @@ -480,11 +523,16 @@ mod tests { PageResult::SplitRequired { replacement_nodes } => { assert_eq!(replacement_nodes.len(), 3); let left = replacement_nodes[0].downcast_ref::().unwrap(); - assert_eq!(left.latest_server_continuation(), Some("ct")); + assert_eq!( + left.state, + RequestState::Continuing { + continuation: "ct".to_string() + } + ); let mid = replacement_nodes[1].downcast_ref::().unwrap(); - assert_eq!(mid.latest_server_continuation(), None); + assert_eq!(mid.state, RequestState::Initial); let right = replacement_nodes[2].downcast_ref::().unwrap(); - assert_eq!(right.latest_server_continuation(), None); + assert_eq!(right.state, RequestState::Initial); } other => panic!("expected SplitRequired, got {:?}", other), } @@ -492,7 +540,7 @@ mod tests { #[tokio::test] async fn topology_provider_error_propagates() { - let mut request = Request::new(operation(), epk_range_target()); + let mut request = Request::new(operation(), epk_range_target(), None); let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); let mut topology = MockTopologyProvider::new(vec![Err(azure_core::Error::with_message( azure_core::error::ErrorKind::Other, @@ -506,7 +554,7 @@ mod tests { #[tokio::test] async fn non_partitioned_topology_change_not_retried() { - let mut request = Request::new(operation(), RequestTarget::NonPartitioned); + let mut request = Request::new(operation(), RequestTarget::NonPartitioned, None); let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]); let mut topology = NoopTopologyProvider; let mut context = PipelineContext::new(&mut executor, &mut topology); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs index 04f50df1ca8..dde5dca5ec2 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs @@ -10,7 +10,10 @@ use std::sync::Arc; use std::time::{Duration, Instant}; -use azure_core::http::headers::{AsHeaders, HeaderName, HeaderValue}; +use azure_core::http::{ + headers::{AsHeaders, HeaderName, HeaderValue}, + request::options::ContentType, +}; use crate::{ diagnostics::{DiagnosticsContextBuilder, ExecutionContext, PipelineType, TransportSecurity}, @@ -19,8 +22,9 @@ use crate::{ LocationStateStore, }, models::{ - request_header_names, AccountEndpoint, ActivityId, CosmosOperation, CosmosResponse, - Credential, DefaultConsistencyLevel, OperationType, SessionToken, SubStatusCode, + cosmos_headers::QUERY_CONTENT_TYPE, request_header_names, AccountEndpoint, ActivityId, + CosmosOperation, CosmosResponse, Credential, DefaultConsistencyLevel, OperationType, + SessionToken, SubStatusCode, }, options::{OperationOptionsView, ReadConsistencyStrategy, ThroughputControlGroupSnapshot}, }; @@ -247,6 +251,8 @@ pub(crate) async fn execute_operation_pipeline( tracing::trace!( method = 
?transport_request.method, url = %transport_request.url, + headers = ?transport_request.headers, + body = ?transport_request.body.as_ref().map(|b| std::str::from_utf8(b).unwrap_or("")), "transport request created"); let selected_transport = match pipeline_type { @@ -556,30 +562,53 @@ fn build_transport_request( ); } - // Cosmos DB uses POST for both create and upsert; the service - // distinguishes them via this header. - if operation.operation_type() == OperationType::Upsert { - headers.insert( - HeaderName::from_static(request_header_names::IS_UPSERT), - HeaderValue::from_static("true"), - ); - } - - // Cosmos DB uses POST for batch (same endpoint as create/upsert); - // the service requires these headers to process the request as a batch. - if operation.operation_type() == OperationType::Batch { - headers.insert( - HeaderName::from_static(request_header_names::IS_BATCH_REQUEST), - HeaderValue::from_static("True"), - ); - headers.insert( - HeaderName::from_static(request_header_names::BATCH_ATOMIC), - HeaderValue::from_static("True"), - ); - headers.insert( - HeaderName::from_static(request_header_names::BATCH_CONTINUE_ON_ERROR), - HeaderValue::from_static("False"), - ); + // Apply operation type-specific headers. + match operation.operation_type() { + OperationType::Upsert => { + headers.insert( + HeaderName::from_static(request_header_names::IS_UPSERT), + HeaderValue::from_static("true"), + ); + } + OperationType::Batch => { + headers.insert( + HeaderName::from_static(request_header_names::IS_BATCH_REQUEST), + HeaderValue::from_static("True"), + ); + headers.insert( + HeaderName::from_static(request_header_names::BATCH_ATOMIC), + HeaderValue::from_static("True"), + ); + headers.insert( + HeaderName::from_static(request_header_names::BATCH_CONTINUE_ON_ERROR), + HeaderValue::from_static("False"), + ); + } + OperationType::Query | OperationType::SqlQuery => { + headers.insert( + HeaderName::from_static(request_header_names::IS_QUERY), + HeaderValue::from_static("True"), + ); + headers.insert( + azure_core::http::headers::CONTENT_TYPE, + HeaderValue::from_static(QUERY_CONTENT_TYPE), + ); + } + OperationType::QueryPlan => { + headers.insert( + HeaderName::from_static(request_header_names::IS_QUERY), + HeaderValue::from_static("True"), + ); + headers.insert( + azure_core::http::headers::CONTENT_TYPE, + HeaderValue::from_static(QUERY_CONTENT_TYPE), + ); + headers.insert( + HeaderName::from_static(request_header_names::IS_QUERY_PLAN_REQUEST), + HeaderValue::from_static("True"), + ); + } + _ => {} } // Add operation type header for fault injection rule matching diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs index ff0cdde4470..5be064ff60c 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs @@ -284,7 +284,10 @@ pub(crate) async fn execute_transport_pipeline( diagnostics.set_fault_injection_evaluations(request_handle, evals); } } - tracing::debug!("transport request complete"); + tracing::debug!( + outcome = ?result.result.outcome, + "transport request complete" + ); if result.shard_id.is_some_and(|failed_shard_id| { local_connectivity_retry_count < MAX_LOCAL_CONNECTIVITY_RETRIES diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs index 
85ea6e668dc..13a4f6309a5 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
@@ -20,6 +20,9 @@ pub(crate) mod request_header_names {
     pub const IF_MATCH: &str = "if-match";
     pub const IF_NONE_MATCH: &str = "if-none-match";
     pub const PREFER: &str = "prefer";
+    pub const IS_QUERY: &str = "x-ms-documentdb-isquery";
+    pub const IS_QUERY_PLAN_REQUEST: &str = "x-ms-cosmos-is-query-plan-request";
+    pub const SUPPORTED_QUERY_FEATURES: &str = "x-ms-cosmos-supported-query-features";
     pub const IS_UPSERT: &str = "x-ms-documentdb-is-upsert";
     pub const IS_BATCH_REQUEST: &str = "x-ms-cosmos-is-batch-request";
     pub const BATCH_ATOMIC: &str = "x-ms-cosmos-batch-atomic";
@@ -33,7 +36,6 @@ pub(crate) mod request_header_names {
     pub const END_EPK: &str = "x-ms-end-epk";
     pub const PARTITION_KEY: &str = "x-ms-documentdb-partitionkey";
     pub const PARTITION_KEY_RANGE_ID: &str = "x-ms-documentdb-partitionkeyrangeid";
-    pub const SUPPORTED_QUERY_FEATURES: &str = "x-ms-cosmos-supported-query-features";
 }
 
 /// Standard Cosmos DB response header names.
@@ -57,6 +59,8 @@ pub(crate) mod response_header_names {
     pub const INTERNAL_PARTITION_ID: &str = "x-ms-cosmos-internal-partition-id";
 }
 
+pub const QUERY_CONTENT_TYPE: &str = "application/query+json";
+
 /// Header names used by the fault injection framework.
 #[cfg(feature = "fault_injection")]
 pub(crate) mod fault_injection_header_names {

From 7748c489ccedce6c7b8877ad65bb231d835dc39e Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Mon, 11 May 2026 17:05:00 +0000
Subject: [PATCH 22/29] Consolidate FeedRange types

---
 .../src/clients/container_client.rs           |  23 +-
 .../azure_data_cosmos/src/feed_range.rs       | 477 ------------------
 sdk/cosmos/azure_data_cosmos/src/lib.rs       |   4 +-
 .../azure_data_cosmos/src/session_helpers.rs  |  17 +-
 .../src/models/feed_range.rs                  | 254 +++++++++-
 5 files changed, 267 insertions(+), 508 deletions(-)
 delete mode 100644 sdk/cosmos/azure_data_cosmos/src/feed_range.rs

diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs
index 214f99ee8aa..59186ad3997 100644
--- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs
@@ -3,7 +3,6 @@
 
 use crate::{
     clients::{offers_client, ClientContext},
-    feed_range::FeedRange,
     models::{
         BatchResponse, ContainerProperties, ItemResponse, ResourceResponse, ThroughputProperties,
    },
@@ -12,14 +11,13 @@ use crate::{
         SessionToken,
     },
     transactional_batch::TransactionalBatch,
-    DeleteContainerOptions, FeedItemIterator, FeedPageIterator, ItemReadOptions, ItemWriteOptions,
+    DeleteContainerOptions, FeedItemIterator, FeedRange, ItemReadOptions, ItemWriteOptions,
     PartitionKey, Query, ReplaceContainerOptions, ThroughputOptions,
 };
 
 use super::ThroughputPoller;
 use azure_data_cosmos_driver::models::{
-    effective_partition_key::EffectivePartitionKey as DriverEpk, ContainerReference,
-    CosmosOperation, ItemReference, OperationTarget, PartitionKeyKind,
+    ContainerReference, CosmosOperation, ItemReference, OperationTarget, PartitionKeyKind,
 };
 use azure_data_cosmos_driver::options::OperationOptions;
 use serde::{de::DeserializeOwned, Serialize};
@@ -712,7 +710,7 @@ impl ContainerClient {
     pub async fn query_items<T: DeserializeOwned + Send + 'static>(
         &self,
         query: impl Into<Query>,
-        partition_key: QueryTarget,
+        partition_key: impl Into<PartitionKey>,
         options: Option<QueryOptions>,
     ) -> azure_core::Result<FeedItemIterator<T>> {
         let options = options.unwrap_or_default();
@@ -852,10
+850,7 @@ impl ContainerClient { )); } - ranges - .iter() - .map(FeedRange::from_partition_key_range) - .collect() + ranges.iter().map(FeedRange::try_from).collect() } /// Returns the [`FeedRange`]s covering the given partition key. @@ -937,15 +932,9 @@ impl ContainerClient { )); } - ranges - .iter() - .map(FeedRange::from_partition_key_range) - .collect() + ranges.iter().map(FeedRange::try_from).collect() } else { - ranges - .iter() - .map(FeedRange::from_partition_key_range) - .collect() + ranges.iter().map(FeedRange::try_from).collect() } } diff --git a/sdk/cosmos/azure_data_cosmos/src/feed_range.rs b/sdk/cosmos/azure_data_cosmos/src/feed_range.rs deleted file mode 100644 index a5d560a6b8a..00000000000 --- a/sdk/cosmos/azure_data_cosmos/src/feed_range.rs +++ /dev/null @@ -1,477 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -//! Types for working with feed ranges in Azure Cosmos DB. -//! -//! A [`FeedRange`] represents a contiguous range of partitions in a Cosmos DB container, -//! defined by effective partition key (EPK) boundaries. Feed ranges enable: -//! -//! - Parallel query processing by distributing ranges across workers -//! - Scoped change feed consumption for specific partitions -//! - Workload distribution across multiple consumers -//! -//! # Examples -//! -//! ```rust,no_run -//! # use azure_data_cosmos::clients::ContainerClient; -//! # async fn example(container: ContainerClient) -> azure_core::Result<()> { -//! // Get physical partition feed ranges -//! let ranges = container.read_feed_ranges(None).await?; -//! println!("Container has {} physical partitions", ranges.len()); -//! -//! // Serialize/deserialize for storage or transfer -//! let serialized = ranges[0].to_string(); -//! let restored: azure_data_cosmos::FeedRange = serialized.parse()?; -//! assert_eq!(ranges[0], restored); -//! # Ok(()) -//! # } -//! ``` - -use azure_core::fmt::SafeDebug; -use base64::Engine; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::str::FromStr; - -use azure_data_cosmos_driver::models::partition_key_range::PartitionKeyRange; - -use crate::hash::EffectivePartitionKey; -use crate::hash::{MAX_EXCLUSIVE_EFFECTIVE_PARTITION_KEY, MIN_INCLUSIVE_EFFECTIVE_PARTITION_KEY}; - -/// An opaque representation of a contiguous range of partitions in a Cosmos DB container. -/// -/// Feed ranges are defined by effective partition key (EPK) boundaries and map to one or more -/// physical partitions. They are obtained from [`ContainerClient::read_feed_ranges()`](crate::clients::ContainerClient::read_feed_ranges) -/// or [`ContainerClient::feed_range_from_partition_key()`](crate::clients::ContainerClient::feed_range_from_partition_key). -/// -/// Feed ranges can be serialized to strings (via [`std::fmt::Display`]/[`std::str::FromStr`]) for storage or transfer -/// between processes. The serialization format is base64-encoded JSON, compatible with other -/// Azure Cosmos DB SDKs. -/// -/// # Serialization Formats -/// -/// `FeedRange` supports two distinct serialization formats: -/// -/// - **[`Display`](std::fmt::Display)/[`FromStr`]** — base64-encoded JSON, intended for string storage and cross-SDK transfer. -/// - **[`Serialize`]/[`Deserialize`]** — structured JSON (`{"Range": {...}}`), intended for embedding in JSON documents. -/// -/// These formats are **not interchangeable**: a value serialized with one cannot be deserialized with the other. 
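(Aside: since the two formats called out above are easy to confuse, here is what they look like side by side. This sketch assumes the consolidated driver-side `FeedRange` keeps the same `Display`/`FromStr` and serde implementations as this deleted SDK type, which is the stated intent of the commit:)

```rust
fn round_trip(range: &azure_data_cosmos::FeedRange) -> azure_core::Result<()> {
    // Display/FromStr: base64-encoded JSON, for string storage and cross-SDK transfer.
    let token = range.to_string();
    let restored: azure_data_cosmos::FeedRange = token.parse()?;
    assert_eq!(*range, restored);

    // Serialize/Deserialize: structured JSON, for embedding in documents.
    let json = serde_json::to_string(range).expect("feed ranges serialize to JSON");
    assert!(json.starts_with(r#"{"Range":"#));
    let embedded: azure_data_cosmos::FeedRange =
        serde_json::from_str(&json).expect("structured JSON round-trips");
    assert_eq!(*range, embedded);

    // The two formats are not interchangeable in either direction.
    assert!(serde_json::from_str::<azure_data_cosmos::FeedRange>(&token).is_err());
    assert!(json.parse::<azure_data_cosmos::FeedRange>().is_err());
    Ok(())
}
```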
-#[derive(Clone, SafeDebug, PartialEq, Eq, Hash)] -#[non_exhaustive] -pub struct FeedRange { - pub(crate) min_inclusive: EffectivePartitionKey, - pub(crate) max_exclusive: EffectivePartitionKey, -} - -/// JSON wire format matching the cross-SDK feed range representation. -/// -/// Example: -/// ```json -/// {"Range": {"min": "", "max": "FF", "isMinInclusive": true, "isMaxInclusive": false}} -/// ``` -#[derive(Serialize, Deserialize)] -struct FeedRangeJson { - #[serde(rename = "Range")] - range: RangeJson, -} - -#[derive(Serialize, Deserialize)] -struct RangeJson { - min: String, - max: String, - #[serde(rename = "isMinInclusive")] - is_min_inclusive: bool, - #[serde(rename = "isMaxInclusive")] - is_max_inclusive: bool, -} - -impl FeedRange { - /// Creates a feed range covering the entire partition key space. - /// - /// This range spans from the minimum to maximum effective partition key values, - /// encompassing all partitions in a container. - pub fn full() -> Self { - Self { - min_inclusive: EffectivePartitionKey::from(MIN_INCLUSIVE_EFFECTIVE_PARTITION_KEY), - max_exclusive: EffectivePartitionKey::from(MAX_EXCLUSIVE_EFFECTIVE_PARTITION_KEY), - } - } - - /// Returns `true` if this feed range is entirely contained within `other`. - pub(crate) fn is_subset_of(&self, other: &FeedRange) -> bool { - other.min_inclusive <= self.min_inclusive && other.max_exclusive >= self.max_exclusive - } - - /// Returns `true` if this feed range and `other` share any portion of the EPK space. - /// - /// Two feed ranges overlap when one starts before the other ends and vice versa. - pub(crate) fn overlaps(&self, other: &FeedRange) -> bool { - self.min_inclusive < other.max_exclusive && other.min_inclusive < self.max_exclusive - } - - /// Returns `true` if this feed range can be combined with `other`. - /// - /// Two ranges can be combined when they overlap or are adjacent - /// (one's max equals the other's min). - pub(crate) fn can_merge(&self, other: &FeedRange) -> bool { - self.max_exclusive >= other.min_inclusive && other.max_exclusive >= self.min_inclusive - } - - /// Combines this feed range with `other` into a bounding range. - pub(crate) fn merge_with(&self, other: &FeedRange) -> FeedRange { - debug_assert!( - self.can_merge(other), - "merge_with called on disjoint ranges" - ); - FeedRange { - min_inclusive: std::cmp::min(self.min_inclusive.clone(), other.min_inclusive.clone()), - max_exclusive: std::cmp::max(self.max_exclusive.clone(), other.max_exclusive.clone()), - } - } - - /// Creates a `FeedRange` from a driver `PartitionKeyRange`. - /// - /// Partition key ranges from the service always use `[min, max)` semantics - /// (min inclusive, max exclusive). Returns an error if the range is inverted. - pub(crate) fn from_partition_key_range(pkr: &PartitionKeyRange) -> azure_core::Result { - if pkr.min_inclusive > pkr.max_exclusive { - return Err(azure_core::Error::with_message( - azure_core::error::ErrorKind::DataConversion, - "partition key range min_inclusive must be <= max_exclusive", - )); - } - Ok(Self { - min_inclusive: EffectivePartitionKey::from(pkr.min_inclusive.as_str()), - max_exclusive: EffectivePartitionKey::from(pkr.max_exclusive.as_str()), - }) - } - - /// Builds the JSON wire-format representation for serialization. 
- fn to_json(&self) -> FeedRangeJson { - FeedRangeJson { - range: RangeJson { - min: self.min_inclusive.as_str().to_owned(), - max: self.max_exclusive.as_str().to_owned(), - is_min_inclusive: true, - is_max_inclusive: false, - }, - } - } - - /// Validates and constructs a `FeedRange` from deserialized JSON fields. - /// - /// Checks inclusivity flags and min ≤ max ordering. - fn from_json(json: FeedRangeJson) -> azure_core::Result { - if !json.range.is_min_inclusive || json.range.is_max_inclusive { - return Err(azure_core::Error::with_message( - azure_core::error::ErrorKind::DataConversion, - "feed range must have [min, max) semantics (isMinInclusive=true, isMaxInclusive=false)", - )); - } - - let min = EffectivePartitionKey::from(json.range.min); - let max = EffectivePartitionKey::from(json.range.max); - - if min > max { - return Err(azure_core::Error::with_message( - azure_core::error::ErrorKind::DataConversion, - "feed range min must be less than or equal to max", - )); - } - - Ok(Self { - min_inclusive: min, - max_exclusive: max, - }) - } - /// Converts this SDK `FeedRange` into the driver's `FeedRange` type. - /// - /// The driver's `FeedRange` is used internally for pipeline routing and - /// does not carry serialization logic. - #[allow( - dead_code, - reason = "will be used when query/change-feed operations target feed ranges" - )] - pub(crate) fn to_driver_feed_range(&self) -> azure_data_cosmos_driver::models::FeedRange { - azure_data_cosmos_driver::models::FeedRange::new( - azure_data_cosmos_driver::models::effective_partition_key::EffectivePartitionKey::from( - self.min_inclusive.as_str(), - ), - azure_data_cosmos_driver::models::effective_partition_key::EffectivePartitionKey::from( - self.max_exclusive.as_str(), - ), - ) - } - - /// Creates an SDK `FeedRange` from the driver's `FeedRange` type. - #[allow( - dead_code, - reason = "will be used when query/change-feed operations target feed ranges" - )] - pub(crate) fn from_driver_feed_range( - driver_range: &azure_data_cosmos_driver::models::FeedRange, - ) -> Self { - Self { - min_inclusive: EffectivePartitionKey::from(driver_range.min_inclusive().as_str()), - max_exclusive: EffectivePartitionKey::from(driver_range.max_exclusive().as_str()), - } - } -} - -impl fmt::Display for FeedRange { - /// Formats this feed range as a base64-encoded JSON string. - /// - /// The output is compatible with other Azure Cosmos DB SDKs and can be - /// parsed back using [`std::str::FromStr`]. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let json_str = serde_json::to_string(&self.to_json()).map_err(|_| fmt::Error)?; - let encoded = base64::engine::general_purpose::STANDARD.encode(json_str.as_bytes()); - f.write_str(&encoded) - } -} - -impl FromStr for FeedRange { - type Err = azure_core::Error; - - /// Parses a feed range from a base64-encoded JSON string. - /// - /// The input should be a string produced by [`std::fmt::Display`] or by another Azure Cosmos DB SDK. 
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let decoded_bytes = base64::engine::general_purpose::STANDARD
-            .decode(s)
-            .map_err(|e| azure_core::Error::new(azure_core::error::ErrorKind::DataConversion, e))?;
-
-        let json: FeedRangeJson = serde_json::from_slice(&decoded_bytes)
-            .map_err(|e| azure_core::Error::new(azure_core::error::ErrorKind::DataConversion, e))?;
-
-        Self::from_json(json)
-    }
-}
-
-impl Serialize for FeedRange {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        self.to_json().serialize(serializer)
-    }
-}
-
-impl<'de> Deserialize<'de> for FeedRange {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let json = FeedRangeJson::deserialize(deserializer)?;
-        Self::from_json(json).map_err(|e| serde::de::Error::custom(e.to_string()))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn full_range() {
-        let full = FeedRange::full();
-        assert_eq!(full.min_inclusive.as_str(), "");
-        assert_eq!(full.max_exclusive.as_str(), "FF");
-    }
-
-    #[test]
-    fn is_subset_of_full() {
-        let full = FeedRange::full();
-        let sub = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("00"),
-            max_exclusive: EffectivePartitionKey::from("80"),
-        };
-        assert!(sub.is_subset_of(&full));
-        assert!(!full.is_subset_of(&sub));
-    }
-
-    #[test]
-    fn is_subset_of_self() {
-        let range = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("20"),
-            max_exclusive: EffectivePartitionKey::from("80"),
-        };
-        assert!(range.is_subset_of(&range));
-    }
-
-    #[test]
-    fn overlaps_basic() {
-        let a = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("00"),
-            max_exclusive: EffectivePartitionKey::from("50"),
-        };
-        let b = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("30"),
-            max_exclusive: EffectivePartitionKey::from("80"),
-        };
-        assert!(a.overlaps(&b));
-        assert!(b.overlaps(&a));
-    }
-
-    #[test]
-    fn overlaps_adjacent_no_overlap() {
-        let a = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("00"),
-            max_exclusive: EffectivePartitionKey::from("50"),
-        };
-        let b = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("50"),
-            max_exclusive: EffectivePartitionKey::from("FF"),
-        };
-        // Adjacent ranges (a's max == b's min) do NOT overlap because max is exclusive
-        assert!(!a.overlaps(&b));
-        assert!(!b.overlaps(&a));
-    }
-
-    #[test]
-    fn overlaps_disjoint() {
-        let a = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("00"),
-            max_exclusive: EffectivePartitionKey::from("30"),
-        };
-        let b = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("50"),
-            max_exclusive: EffectivePartitionKey::from("FF"),
-        };
-        assert!(!a.overlaps(&b));
-        assert!(!b.overlaps(&a));
-    }
-
-    #[test]
-    fn display_produces_expected_base64_full_range() {
-        let range = FeedRange {
-            min_inclusive: EffectivePartitionKey::from(""),
-            max_exclusive: EffectivePartitionKey::from("FF"),
-        };
-        assert_eq!(
-            range.to_string(),
-            "eyJSYW5nZSI6eyJtaW4iOiIiLCJtYXgiOiJGRiIsImlzTWluSW5jbHVzaXZlIjp0cnVlLCJpc01heEluY2x1c2l2ZSI6ZmFsc2V9fQ=="
-        );
-    }
-
-    #[test]
-    fn display_produces_expected_base64_sub_range() {
-        let range = FeedRange {
-            min_inclusive: EffectivePartitionKey::from("3FFFFFFFFFFF"),
-            max_exclusive: EffectivePartitionKey::from("7FFFFFFFFFFF"),
-        };
-        assert_eq!(
-            range.to_string(),
-            "eyJSYW5nZSI6eyJtaW4iOiIzRkZGRkZGRkZGRkYiLCJtYXgiOiI3RkZGRkZGRkZGRkYiLCJpc01pbkluY2x1c2l2ZSI6dHJ1ZSwiaXNNYXhJbmNsdXNpdmUiOmZhbHNlfX0="
-        );
-    }
-
-    #[test]
-    fn from_str_parses_full_range() {
-        let input =
-            "eyJSYW5nZSI6eyJtaW4iOiIiLCJtYXgiOiJGRiIsImlzTWluSW5jbHVzaXZlIjp0cnVlLCJpc01heEluY2x1c2l2ZSI6ZmFsc2V9fQ==";
-        let range: FeedRange = input.parse().unwrap();
-        assert_eq!(range.min_inclusive.as_str(), "");
-        assert_eq!(range.max_exclusive.as_str(), "FF");
-    }
-
-    #[test]
-    fn from_str_parses_sub_range() {
-        let input = "eyJSYW5nZSI6eyJtaW4iOiIzRkZGRkZGRkZGRkYiLCJtYXgiOiI3RkZGRkZGRkZGRkYiLCJpc01pbkluY2x1c2l2ZSI6dHJ1ZSwiaXNNYXhJbmNsdXNpdmUiOmZhbHNlfX0=";
-        let range: FeedRange = input.parse().unwrap();
-        assert_eq!(range.min_inclusive.as_str(), "3FFFFFFFFFFF");
-        assert_eq!(range.max_exclusive.as_str(), "7FFFFFFFFFFF");
-    }
-
-    #[test]
-    fn serde_json_serializes_to_cross_sdk_format() {
-        let range = FeedRange {
-            min_inclusive: EffectivePartitionKey::from(""),
-            max_exclusive: EffectivePartitionKey::from("FF"),
-        };
-        let json = serde_json::to_string(&range).unwrap();
-
-        let value: serde_json::Value = serde_json::from_str(&json).unwrap();
-        let inner = value.get("Range").expect("expected 'Range' key");
-        assert_eq!(inner.get("min").unwrap().as_str().unwrap(), "");
-        assert_eq!(inner.get("max").unwrap().as_str().unwrap(), "FF");
-        assert!(inner.get("isMinInclusive").unwrap().as_bool().unwrap());
-        assert!(!inner.get("isMaxInclusive").unwrap().as_bool().unwrap());
-    }
-
-    #[test]
-    fn serde_json_deserializes_cross_sdk_format() {
-        let json =
-            r#"{"Range":{"min":"","max":"FF","isMinInclusive":true,"isMaxInclusive":false}}"#;
-        let range: FeedRange = serde_json::from_str(json).unwrap();
-        assert_eq!(range.min_inclusive.as_str(), "");
-        assert_eq!(range.max_exclusive.as_str(), "FF");
-    }
-
-    #[test]
-    fn from_str_invalid_base64() {
-        let result = "not-valid-base64!!!".parse::<FeedRange>();
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn from_str_invalid_json() {
-        let encoded = base64::engine::general_purpose::STANDARD.encode(b"not json");
-        let result = encoded.parse::<FeedRange>();
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn from_partition_key_range() {
-        let pkr = PartitionKeyRange::new("0".to_string(), "".to_string(), "FF".to_string());
-        let feed_range = FeedRange::from_partition_key_range(&pkr).unwrap();
-        assert_eq!(feed_range.min_inclusive.as_str(), "");
-        assert_eq!(feed_range.max_exclusive.as_str(), "FF");
-    }
-
-    #[test]
-    fn cross_sdk_compatibility() {
-        // Verify that the full range serializes to the same base64 string regardless of platform
-        let full = FeedRange::full();
-        let serialized = full.to_string();
-
-        // Decode and verify the JSON structure
-        let decoded = base64::engine::general_purpose::STANDARD
-            .decode(&serialized)
-            .unwrap();
-        let json: serde_json::Value = serde_json::from_slice(&decoded).unwrap();
-
-        let range = json.get("Range").unwrap();
-        assert_eq!(range.get("min").unwrap().as_str().unwrap(), "");
-        assert_eq!(range.get("max").unwrap().as_str().unwrap(), "FF");
-        assert!(range.get("isMinInclusive").unwrap().as_bool().unwrap());
-        assert!(!range.get("isMaxInclusive").unwrap().as_bool().unwrap());
-    }
-
-    #[test]
-    fn from_str_rejects_max_inclusive() {
-        let json = r#"{"Range":{"min":"","max":"FF","isMinInclusive":true,"isMaxInclusive":true}}"#;
-        let encoded = base64::engine::general_purpose::STANDARD.encode(json.as_bytes());
-        assert!(encoded.parse::<FeedRange>().is_err());
-    }
-
-    #[test]
-    fn serde_rejects_min_not_inclusive() {
-        let json =
-            r#"{"Range":{"min":"","max":"FF","isMinInclusive":false,"isMaxInclusive":false}}"#;
-        assert!(serde_json::from_str::<FeedRange>(json).is_err());
-    }
-
-    #[test]
-    fn from_str_rejects_inverted_range() {
-        let json =
-            r#"{"Range":{"min":"FF","max":"","isMinInclusive":true,"isMaxInclusive":false}}"#;
-        let encoded = base64::engine::general_purpose::STANDARD.encode(json.as_bytes());
-        assert!(encoded.parse::<FeedRange>().is_err());
-    }
-
-    #[test]
-    fn serde_rejects_inverted_range() {
-        let json =
-            r#"{"Range":{"min":"FF","max":"","isMinInclusive":true,"isMaxInclusive":false}}"#;
-        assert!(serde_json::from_str::<FeedRange>(json).is_err());
-    }
-}
diff --git a/sdk/cosmos/azure_data_cosmos/src/lib.rs b/sdk/cosmos/azure_data_cosmos/src/lib.rs
index 12da0cbd9b9..f1cb2a10aa8 100644
--- a/sdk/cosmos/azure_data_cosmos/src/lib.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/lib.rs
@@ -11,7 +11,6 @@ mod connection_string;
 pub mod constants;
 mod credential;
 mod feed;
-mod feed_range;
 pub mod options;
 mod partition_key;
 pub(crate) mod pipeline;
@@ -30,6 +29,8 @@ pub use clients::CosmosClientBuilder;
 pub use account_endpoint::CosmosAccountEndpoint;
 pub use account_reference::CosmosAccountReference;
+#[doc(inline)]
+pub use azure_data_cosmos_driver::models::FeedRange;
 pub use clients::ThroughputPoller;
 pub use connection_string::*;
 pub use credential::CosmosCredential;
@@ -44,7 +45,6 @@ pub use transactional_batch::{
 };
 
 pub use feed::{FeedItemIterator, FeedPage, FeedPageIterator, QueryFeedPage};
-pub use feed_range::FeedRange;
 
 mod background_task_manager;
 mod cosmos_request;
 mod driver_bridge;
diff --git a/sdk/cosmos/azure_data_cosmos/src/session_helpers.rs b/sdk/cosmos/azure_data_cosmos/src/session_helpers.rs
index 03211351e81..d83375b9070 100644
--- a/sdk/cosmos/azure_data_cosmos/src/session_helpers.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/session_helpers.rs
@@ -3,7 +3,7 @@
 
 //! Helpers for merging and managing session tokens across feed ranges.
 
-use crate::feed_range::FeedRange;
+use crate::FeedRange;
 use azure_core::error::ErrorKind;
 use azure_data_cosmos_driver::models::{SessionToken, SessionTokenSegment};
 
@@ -99,9 +99,9 @@ fn merge_ranges_with_subsets(
     // Sort by range size descending: larger ranges (parents) first.
     // Primary: max_exclusive descending, secondary: min_inclusive ascending.
overlapping.sort_by(|(a, _), (b, _)| { - b.max_exclusive - .cmp(&a.max_exclusive) - .then(a.min_inclusive.cmp(&b.min_inclusive)) + b.max_exclusive() + .cmp(a.max_exclusive()) + .then(a.min_inclusive().cmp(b.min_inclusive())) }); let mut processed = Vec::new(); @@ -188,7 +188,7 @@ fn analyze_subsets( ) -> azure_core::Result { // Sort subsets by min_inclusive so adjacent children are always in order let mut sorted_subsets = subsets.to_vec(); - sorted_subsets.sort_by(|a, b| a.1.min_inclusive.cmp(&b.1.min_inclusive)); + sorted_subsets.sort_by(|a, b| a.1.min_inclusive().cmp(b.1.min_inclusive())); for start_idx in 0..sorted_subsets.len() { let mut merged_range = sorted_subsets[start_idx].1.clone(); @@ -354,13 +354,10 @@ pub(crate) fn get_latest_session_token( #[cfg(test)] mod tests { use super::*; - use crate::hash::EffectivePartitionKey; + use azure_data_cosmos_driver::models::effective_partition_key::EffectivePartitionKey as DriverEpk; fn fr(min: &str, max: &str) -> FeedRange { - FeedRange { - min_inclusive: EffectivePartitionKey::from(min), - max_exclusive: EffectivePartitionKey::from(max), - } + FeedRange::new(DriverEpk::from(min), DriverEpk::from(max)) } fn st(s: &str) -> SessionToken { diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs index 12c37d8b1c7..8568d64d207 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs @@ -5,8 +5,16 @@ //! //! A [`FeedRange`] represents a contiguous range of the effective partition key (EPK) space. //! It is used by the dataflow pipeline to target operations at one or more physical partitions. +//! +//! Feed ranges can also be serialized to base64-encoded JSON for cross-SDK storage and transport. + +use azure_core::{error::ErrorKind, fmt::SafeDebug}; +use base64::Engine; +use serde::{Deserialize, Serialize}; +use std::{cmp, fmt, str::FromStr}; use crate::models::effective_partition_key::EffectivePartitionKey; +use crate::models::partition_key_range::PartitionKeyRange; /// A contiguous range of the effective partition key space. /// @@ -15,12 +23,28 @@ use crate::models::effective_partition_key::EffectivePartitionKey; /// topology. /// /// Use [`FeedRange::full()`] for the entire key space (`""..FF`). -#[derive(Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, SafeDebug, PartialEq, Eq, Hash)] pub struct FeedRange { min_inclusive: EffectivePartitionKey, max_exclusive: EffectivePartitionKey, } +#[derive(Serialize, Deserialize)] +struct FeedRangeJson { + #[serde(rename = "Range")] + range: RangeJson, +} + +#[derive(Serialize, Deserialize)] +struct RangeJson { + min: String, + max: String, + #[serde(rename = "isMinInclusive")] + is_min_inclusive: bool, + #[serde(rename = "isMaxInclusive")] + is_max_inclusive: bool, +} + impl FeedRange { /// Creates a feed range from explicit EPK bounds. pub fn new(min_inclusive: EffectivePartitionKey, max_exclusive: EffectivePartitionKey) -> Self { @@ -59,6 +83,127 @@ impl FeedRange { pub fn overlaps(&self, other: &FeedRange) -> bool { self.min_inclusive < other.max_exclusive && other.min_inclusive < self.max_exclusive } + + /// Returns `true` if this feed range can be combined with `other`. + /// + /// Two ranges can be combined when they overlap or are adjacent + /// (one's max equals the other's min). 
+    pub fn can_merge(&self, other: &FeedRange) -> bool {
+        self.max_exclusive >= other.min_inclusive && other.max_exclusive >= self.min_inclusive
+    }
+
+    /// Combines this feed range with `other` into a bounding range.
+    pub fn merge_with(&self, other: &FeedRange) -> FeedRange {
+        debug_assert!(
+            self.can_merge(other),
+            "merge_with called on disjoint ranges"
+        );
+        FeedRange {
+            min_inclusive: cmp::min(self.min_inclusive.clone(), other.min_inclusive.clone()),
+            max_exclusive: cmp::max(self.max_exclusive.clone(), other.max_exclusive.clone()),
+        }
+    }
+
+    fn to_json(&self) -> FeedRangeJson {
+        FeedRangeJson {
+            range: RangeJson {
+                min: self.min_inclusive.as_str().to_owned(),
+                max: self.max_exclusive.as_str().to_owned(),
+                is_min_inclusive: true,
+                is_max_inclusive: false,
+            },
+        }
+    }
+
+    fn from_json(json: FeedRangeJson) -> azure_core::Result<Self> {
+        if !json.range.is_min_inclusive || json.range.is_max_inclusive {
+            return Err(azure_core::Error::with_message(
+                ErrorKind::DataConversion,
+                "feed range must have [min, max) semantics (isMinInclusive=true, isMaxInclusive=false)",
+            ));
+        }
+
+        let min = EffectivePartitionKey::from(json.range.min);
+        let max = EffectivePartitionKey::from(json.range.max);
+
+        if min > max {
+            return Err(azure_core::Error::with_message(
+                ErrorKind::DataConversion,
+                "feed range min must be less than or equal to max",
+            ));
+        }
+
+        Ok(Self {
+            min_inclusive: min,
+            max_exclusive: max,
+        })
+    }
+}
+
+impl TryFrom<&PartitionKeyRange> for FeedRange {
+    type Error = azure_core::Error;
+
+    /// Creates a `FeedRange` from a driver `PartitionKeyRange`.
+    ///
+    /// Partition key ranges from the service always use `[min, max)` semantics
+    /// (min inclusive, max exclusive). Returns an error if the range is inverted.
+    fn try_from(pkr: &PartitionKeyRange) -> Result<Self, Self::Error> {
+        if pkr.min_inclusive > pkr.max_exclusive {
+            return Err(azure_core::Error::with_message(
+                ErrorKind::DataConversion,
+                "partition key range min_inclusive must be <= max_exclusive",
+            ));
+        }
+
+        Ok(Self {
+            min_inclusive: EffectivePartitionKey::from(pkr.min_inclusive.as_str()),
+            max_exclusive: EffectivePartitionKey::from(pkr.max_exclusive.as_str()),
+        })
+    }
+}
+
+impl fmt::Display for FeedRange {
+    /// Formats this feed range as a base64-encoded JSON string.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let json_str = serde_json::to_string(&self.to_json()).map_err(|_| fmt::Error)?;
+        let encoded = base64::engine::general_purpose::STANDARD.encode(json_str.as_bytes());
+        f.write_str(&encoded)
+    }
+}
+
+impl FromStr for FeedRange {
+    type Err = azure_core::Error;
+
+    /// Parses a feed range from a base64-encoded JSON string.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let decoded_bytes = base64::engine::general_purpose::STANDARD
+            .decode(s)
+            .map_err(|e| azure_core::Error::new(ErrorKind::DataConversion, e))?;
+
+        let json: FeedRangeJson = serde_json::from_slice(&decoded_bytes)
+            .map_err(|e| azure_core::Error::new(ErrorKind::DataConversion, e))?;
+
+        Self::from_json(json)
+    }
+}
+
+impl Serialize for FeedRange {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        self.to_json().serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for FeedRange {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let json = FeedRangeJson::deserialize(deserializer)?;
+        Self::from_json(json).map_err(|e| serde::de::Error::custom(e.to_string()))
+    }
 }
 
 #[cfg(test)]
@@ -116,7 +261,6 @@ mod tests {
             EffectivePartitionKey::from("50"),
             EffectivePartitionKey::from("FF"),
         );
-        // Adjacent ranges (a's max == b's min) do NOT overlap because max is exclusive.
         assert!(!a.overlaps(&b));
         assert!(!b.overlaps(&a));
     }
@@ -134,4 +278,110 @@ mod tests {
         assert!(!a.overlaps(&b));
         assert!(!b.overlaps(&a));
     }
+
+    #[test]
+    fn can_merge_adjacent() {
+        let a = FeedRange::new(
+            EffectivePartitionKey::from("00"),
+            EffectivePartitionKey::from("50"),
+        );
+        let b = FeedRange::new(
+            EffectivePartitionKey::from("50"),
+            EffectivePartitionKey::from("FF"),
+        );
+
+        assert!(a.can_merge(&b));
+        assert!(b.can_merge(&a));
+    }
+
+    #[test]
+    fn merge_with_bounds() {
+        let a = FeedRange::new(
+            EffectivePartitionKey::from("00"),
+            EffectivePartitionKey::from("50"),
+        );
+        let b = FeedRange::new(
+            EffectivePartitionKey::from("30"),
+            EffectivePartitionKey::from("FF"),
+        );
+
+        let merged = a.merge_with(&b);
+        assert_eq!(merged.min_inclusive().as_str(), "00");
+        assert_eq!(merged.max_exclusive().as_str(), "FF");
+    }
+
+    #[test]
+    fn display_round_trip() {
+        let range = FeedRange::new(
+            EffectivePartitionKey::from("3FFFFFFFFFFF"),
+            EffectivePartitionKey::from("7FFFFFFFFFFF"),
+        );
+
+        let serialized = range.to_string();
+        let parsed: FeedRange = serialized.parse().unwrap();
+
+        assert_eq!(parsed, range);
+    }
+
+    #[test]
+    fn serde_json_round_trip() {
+        let range = FeedRange::new(
+            EffectivePartitionKey::from(""),
+            EffectivePartitionKey::from("FF"),
+        );
+
+        let json = serde_json::to_string(&range).unwrap();
+        let parsed: FeedRange = serde_json::from_str(&json).unwrap();
+
+        assert_eq!(parsed, range);
+    }
+
+    #[test]
+    fn try_from_partition_key_range() {
+        let pkr = PartitionKeyRange::new("0".to_string(), "".to_string(), "FF".to_string());
+        let feed_range = FeedRange::try_from(&pkr).unwrap();
+
+        assert_eq!(feed_range.min_inclusive().as_str(), "");
+        assert_eq!(feed_range.max_exclusive().as_str(), "FF");
+    }
+
+    #[test]
+    fn from_str_invalid_base64() {
+        assert!("not-valid-base64!!!".parse::<FeedRange>().is_err());
+    }
+
+    #[test]
+    fn from_str_invalid_json() {
+        let encoded = base64::engine::general_purpose::STANDARD.encode(b"not json");
+        assert!(encoded.parse::<FeedRange>().is_err());
+    }
+
+    #[test]
+    fn from_str_rejects_max_inclusive() {
+        let json = r#"{"Range":{"min":"","max":"FF","isMinInclusive":true,"isMaxInclusive":true}}"#;
+        let encoded = base64::engine::general_purpose::STANDARD.encode(json.as_bytes());
+        assert!(encoded.parse::<FeedRange>().is_err());
+    }
+
+    #[test]
+    fn serde_rejects_min_not_inclusive() {
+        let json =
+            r#"{"Range":{"min":"","max":"FF","isMinInclusive":false,"isMaxInclusive":false}}"#;
+        assert!(serde_json::from_str::<FeedRange>(json).is_err());
+    }
+
+    #[test]
+    fn from_str_rejects_inverted_range() {
+        let json =
+            r#"{"Range":{"min":"FF","max":"","isMinInclusive":true,"isMaxInclusive":false}}"#;
+        let encoded = base64::engine::general_purpose::STANDARD.encode(json.as_bytes());
+        assert!(encoded.parse::<FeedRange>().is_err());
+    }
+
+    #[test]
+    fn serde_rejects_inverted_range() {
+        let json =
+            r#"{"Range":{"min":"FF","max":"","isMinInclusive":true,"isMaxInclusive":false}}"#;
+        assert!(serde_json::from_str::<FeedRange>(json).is_err());
+    }
 }

From 61dc4495435d15157e843d7e004f143e37ab962c Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Mon, 11 May 2026 17:18:03 +0000
Subject: [PATCH 23/29] Move EPK and hashing from SDK to driver
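
The SDK keeps its public `EffectivePartitionKey` type but now delegates
hashing to the driver's implementation. Illustrative sketch of the resulting
behavior, mirroring this patch's own test baseline (not part of the diff):

    // SDK-side component, hashed through the driver's V2 implementation.
    let comp = InnerPartitionKeyValue::String("customer42".to_string());
    let epk = get_hashed_partition_key_string(&[&comp], PartitionKeyKind::Hash, 2);
    assert_eq!(epk.as_str(), "19819C94CE42A1654CCC8110539D9589");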
---
 sdk/cosmos/azure_data_cosmos/src/hash.rs | 470 ++++-------
 sdk/cosmos/azure_data_cosmos/src/lib.rs  |   3 +-
 2 files changed, 68 insertions(+), 405 deletions(-)

diff --git a/sdk/cosmos/azure_data_cosmos/src/hash.rs b/sdk/cosmos/azure_data_cosmos/src/hash.rs
index 5a0bce076a6..16dd6ab6f3f 100644
--- a/sdk/cosmos/azure_data_cosmos/src/hash.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/hash.rs
@@ -2,61 +2,51 @@
 // Licensed under the MIT License.
 
 use crate::models::PartitionKeyKind;
-use crate::murmur_hash::{murmurhash3_128, murmurhash3_32};
-use std::fmt::Write;
-
-const MAX_STRING_BYTES_TO_APPEND: usize = 100;
-pub(crate) const MIN_INCLUSIVE_EFFECTIVE_PARTITION_KEY: &str = "";
-pub(crate) const MAX_EXCLUSIVE_EFFECTIVE_PARTITION_KEY: &str = "FF";
+use azure_data_cosmos_driver::models::{
+    effective_partition_key::EffectivePartitionKey as DriverEffectivePartitionKey,
+    PartitionKeyValue as DriverPartitionKeyValue, PartitionKeyVersion,
+};
+use std::fmt;
 
 /// A strongly-typed wrapper around the hex-encoded effective partition key string.
 ///
-/// Use [`AsRef`] to obtain the underlying string when passing to APIs
-/// that accept `&str`.
-///
-/// Ordering is lexicographic on the underlying hex string. This is correct because:
-/// - All actual EPK hash values are uppercase hex strings of consistent length
-/// - The sentinel MAX ("FF") sorts after all real hashes by the Cosmos DB EPK space design
-/// - The sentinel MIN ("") sorts before everything
+/// This SDK type wraps the driver's canonical EPK implementation while keeping
+/// the SDK's public API surface explicit and stable.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct EffectivePartitionKey(String);
+pub struct EffectivePartitionKey(DriverEffectivePartitionKey);
 
 impl EffectivePartitionKey {
     /// Returns the underlying string representation.
     pub fn as_str(&self) -> &str {
-        &self.0
+        self.0.as_str()
+    }
+}
+
+impl fmt::Display for EffectivePartitionKey {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
     }
 }
 
 impl AsRef<str> for EffectivePartitionKey {
     fn as_ref(&self) -> &str {
-        &self.0
+        self.0.as_ref()
     }
 }
 
 impl From<String> for EffectivePartitionKey {
-    fn from(s: String) -> Self {
-        Self(s)
+    fn from(value: String) -> Self {
+        Self(DriverEffectivePartitionKey::from(value))
     }
 }
 
 impl From<&str> for EffectivePartitionKey {
-    fn from(s: &str) -> Self {
-        Self(s.to_owned())
+    fn from(value: &str) -> Self {
+        Self(DriverEffectivePartitionKey::from(value))
     }
 }
 
-/// Contains all allowed markers for component marker types.
-mod component {
-    pub const UNDEFINED: u8 = 0x00;
-    pub const NULL: u8 = 0x01;
-    pub const BOOL_FALSE: u8 = 0x02;
-    pub const BOOL_TRUE: u8 = 0x03;
-    pub const NUMBER: u8 = 0x05;
-    pub const STRING: u8 = 0x08;
-    pub const INFINITY: u8 = 0xFF;
-}
-
+/// Internal representation used by SDK partition key APIs.
 #[derive(Clone, Debug, PartialEq)]
 pub enum InnerPartitionKeyValue {
     Null,
     Bool(bool),
     Number(f64),
     String(String),
     Infinity,
     Undefined,
 }
 
-// `f64` does not implement `Eq`, but in this domain partition key numbers are
-// always finite, non-NaN values, so total equality holds. We implement `Eq`
-// manually to express this invariant.
 impl Eq for InnerPartitionKeyValue {}
 
-impl InnerPartitionKeyValue {
-    /// Common hashing writer core: writes type marker + payload (string suffix used by V2).
-    fn write_for_hashing_core(&self, string_suffix: u8, writer: &mut Vec<u8>, truncate: bool) {
-        match self {
-            InnerPartitionKeyValue::Bool(true) => writer.push(component::BOOL_TRUE),
-            InnerPartitionKeyValue::Bool(false) => writer.push(component::BOOL_FALSE),
-            InnerPartitionKeyValue::Null => writer.push(component::NULL),
-            InnerPartitionKeyValue::Number(n) => {
-                writer.push(component::NUMBER); // Number marker
-                let bytes = n.to_le_bytes();
-                writer.extend_from_slice(&bytes);
-            }
-            InnerPartitionKeyValue::String(s) => {
-                writer.push(component::STRING); // String marker
-                let bytes = s.as_bytes();
-                if truncate && bytes.len() > MAX_STRING_BYTES_TO_APPEND {
-                    writer.extend_from_slice(&bytes[..MAX_STRING_BYTES_TO_APPEND]);
-                } else {
-                    writer.extend_from_slice(bytes);
-                }
-                writer.push(string_suffix);
-            }
-            InnerPartitionKeyValue::Undefined => writer.push(component::UNDEFINED),
-            InnerPartitionKeyValue::Infinity => writer.push(component::INFINITY),
-        }
-    }
-
-    /// V1 hashing wrapper (string suffix 0x00)
-    pub fn write_for_hashing_v1(&self, writer: &mut Vec<u8>) {
-        self.write_for_hashing_core(0x00u8, writer, true)
-    }
-
-    /// V2 hashing wrapper (string suffix 0xFF)
-    pub fn write_for_hashing_v2(&self, writer: &mut Vec<u8>) {
-        self.write_for_hashing_core(0xFFu8, writer, false)
-    }
-
-    /// V1 binary encoding (subset required for test cases):
-    /// * Bool -> marker (0x03 true / 0x02 false)
-    /// * Number -> marker (0x05) + variable-length 64-bit ordering-preserving encoding
-    /// * String -> marker (0x08) + each byte+1 (no 0xFF guard) up to 100 or 101 (if truncated) then 0x00 terminator if short
-    /// * Undefined -> marker (0x00)
-    /// * Null -> marker (0x01).
-    pub fn write_for_binary_encoding_v1(&self, writer: &mut Vec<u8>) {
-        match self {
-            InnerPartitionKeyValue::Bool(true) => writer.push(component::BOOL_TRUE),
-            InnerPartitionKeyValue::Bool(false) => writer.push(component::BOOL_FALSE),
-            InnerPartitionKeyValue::Infinity => writer.push(component::INFINITY),
-            InnerPartitionKeyValue::Number(n) => {
-                writer.push(component::NUMBER);
-                let mut payload = encode_double_as_uint64(*n);
-                // First 8 bits
-                writer.push((payload >> 56) as u8);
-                payload <<= 8;
-                let mut first = true;
-                let mut byte_to_write: u8 = 0;
-                while payload != 0 {
-                    if !first {
-                        writer.push(byte_to_write);
-                    } else {
-                        first = false;
-                    }
-                    byte_to_write = ((payload >> 56) as u8) | 0x01; // set continuation bit
-                    payload <<= 7; // consume 7 bits (since we used 7 data bits + 1 flag)
-                }
-                writer.push(byte_to_write & 0xFE); // last byte with 0 flag
-            }
-            InnerPartitionKeyValue::String(s) => {
-                writer.push(component::STRING);
-                let utf8 = s.as_bytes();
-                let short = utf8.len() <= MAX_STRING_BYTES_TO_APPEND;
-                // Use std::cmp to determine truncated write length (include sentinel +1 when longer than max)
-                let write_len = if short {
-                    utf8.len()
-                } else {
-                    std::cmp::min(utf8.len(), MAX_STRING_BYTES_TO_APPEND + 1)
-                };
-                for item in utf8.iter().take(write_len) {
-                    let b = item.wrapping_add(1); // unconditional +1
-                    writer.push(b);
-                }
-                if short {
-                    writer.push(0x00);
-                }
-            }
-            InnerPartitionKeyValue::Undefined => writer.push(component::UNDEFINED),
-            InnerPartitionKeyValue::Null => writer.push(component::NULL),
-        }
-    }
-
-    /// Binary encoding used by `_to_hex_encoded_binary_string`.
-    pub fn write_for_binary_encoding(&self, writer: &mut Vec<u8>) {
-        match self {
-            InnerPartitionKeyValue::Bool(true) => writer.push(component::BOOL_TRUE),
-            InnerPartitionKeyValue::Bool(false) => writer.push(component::BOOL_FALSE),
-            InnerPartitionKeyValue::Infinity => writer.push(component::INFINITY),
-            InnerPartitionKeyValue::Number(n) => {
-                writer.push(component::NUMBER);
-                // use IEEE754 little-endian double representation
-                writer.extend_from_slice(&n.to_le_bytes());
-            }
-            InnerPartitionKeyValue::String(s) => {
-                writer.push(component::STRING);
-                let utf8 = s.as_bytes();
-                let size = std::cmp::min(utf8.len(), MAX_STRING_BYTES_TO_APPEND);
-                let short_string: bool;
-                let write_len = if size == MAX_STRING_BYTES_TO_APPEND {
-                    short_string = false;
-                    size + 1
-                } else {
-                    short_string = true;
-                    size
-                };
-                for item in utf8.iter().take(write_len) {
-                    let mut b = *item;
-                    if b < 0xFF {
-                        b = b.wrapping_add(1);
-                    }
-                    writer.push(b);
-                }
-                if short_string {
-                    writer.push(0x00);
-                }
-            }
-            InnerPartitionKeyValue::Undefined => writer.push(component::UNDEFINED),
-            InnerPartitionKeyValue::Null => writer.push(component::NULL),
-        }
-    }
-}
-
+fn to_driver_partition_key_value(value: &InnerPartitionKeyValue) -> DriverPartitionKeyValue {
+    match value {
+        InnerPartitionKeyValue::Null => DriverPartitionKeyValue::from(Option::<String>::None),
+        InnerPartitionKeyValue::Bool(b) => DriverPartitionKeyValue::from(*b),
+        InnerPartitionKeyValue::Number(n) => DriverPartitionKeyValue::from(*n),
+        InnerPartitionKeyValue::String(s) => DriverPartitionKeyValue::from(s.clone()),
+        InnerPartitionKeyValue::Undefined => DriverPartitionKeyValue::undefined(),
+        InnerPartitionKeyValue::Infinity => DriverPartitionKeyValue::from(Option::<String>::None),
+    }
+}
 
 /// Returns an [`EffectivePartitionKey`] representing the hashed partition key.
+///
+/// Versions 1 and 2 map directly to the driver's partition key version enum.
+/// Any other version falls back to V2 for forward-compatible behavior.
 pub fn get_hashed_partition_key_string(
     pk_value: &[&InnerPartitionKeyValue],
     kind: PartitionKeyKind,
     version: u8,
 ) -> EffectivePartitionKey {
     if pk_value.is_empty() {
-        return EffectivePartitionKey(MIN_INCLUSIVE_EFFECTIVE_PARTITION_KEY.to_string());
+        return EffectivePartitionKey(DriverEffectivePartitionKey::min());
     }
+
     if pk_value.len() == 1 && *pk_value[0] == InnerPartitionKeyValue::Infinity {
-        return EffectivePartitionKey(MAX_EXCLUSIVE_EFFECTIVE_PARTITION_KEY.to_string());
+        return EffectivePartitionKey(DriverEffectivePartitionKey::max());
     }
 
-    let raw = match kind {
-        PartitionKeyKind::Hash => match version {
-            1 => get_effective_partition_key_for_hash_partitioning_v1(pk_value),
-            2 => get_effective_partition_key_for_hash_partitioning_v2(pk_value),
-            _ => {
-                tracing::warn!(
-                    "Hash partitioning version {} is not supported, falling back to binary encoding.",
-                    version
-                );
-                to_hex_encoded_binary_string(pk_value)
-            }
-        },
-        PartitionKeyKind::MultiHash => {
-            // MultiHash is not yet implemented; use the non-hashed binary encoding
-            // as a deterministic fallback instead of panicking.
-            tracing::warn!(
-                "MultiHash partitioning is not yet supported, falling back to binary encoding."
-            );
-            to_hex_encoded_binary_string(pk_value)
-        }
-        _ => {
+    let driver_values: Vec<DriverPartitionKeyValue> = pk_value
+        .iter()
+        .map(|value| to_driver_partition_key_value(value))
+        .collect();
+
+    let version = match version {
+        1 => PartitionKeyVersion::V1,
+        2 => PartitionKeyVersion::V2,
+        unsupported => {
             tracing::warn!(
-                "Unknown partition key kind '{:?}', falling back to binary encoding.",
-                kind
+                "Partition key hashing version {} is unsupported in SDK API; defaulting to V2",
+                unsupported
             );
-            to_hex_encoded_binary_string(pk_value)
+            PartitionKeyVersion::V2
         }
     };
 
-    EffectivePartitionKey(raw)
-}
-
-/// V2: encode components with `_write_for_hashing_v2`, hash the concatenated bytes,
-fn get_effective_partition_key_for_hash_partitioning_v2(
-    pk_value: &[&InnerPartitionKeyValue],
-) -> String {
-    let mut ms: Vec<u8> = Vec::new();
-    for comp in pk_value {
-        comp.write_for_hashing_v2(&mut ms);
-    }
-    let hash_128 = murmurhash3_128(&ms, 0);
-    let mut hash_bytes = hash_128.to_le_bytes();
-    hash_bytes.reverse();
-    // Reset 2 most significant bits of first byte
-    hash_bytes[0] &= 0x3F;
-    bytes_to_hex_upper(&hash_bytes)
-}
-
-/// V1: compute 32-bit murmur hash over concatenated component encodings (suffix 0x00 for strings),
-/// convert hash (u32) to f64 (possible precision loss is intentional to mirror other sdks), then binary-encode
-/// [hash_value_as_number] + truncated original components using V1 binary rules.
-fn get_effective_partition_key_for_hash_partitioning_v1(
-    pk_value: &[&InnerPartitionKeyValue],
-) -> String {
-    // Build hashing buffer using V1 hashing encoding (truncation is handled by write_for_hashing_v1)
-    let mut hashing_bytes: Vec<u8> = Vec::new();
-    for v in pk_value {
-        v.write_for_hashing_v1(&mut hashing_bytes);
-    }
-
-    let hash32 = murmurhash3_32(&hashing_bytes, 0u32);
-    let hash_value_f64 = hash32 as f64; // casts UInt32 -> float (lossy above 2^24)
-
-    // For the binary encoding step, strings must also be truncated to match
-    // the truncation applied during hashing.
-    let hash_component = InnerPartitionKeyValue::Number(hash_value_f64);
-    let truncated_values: Vec<InnerPartitionKeyValue> = pk_value
-        .iter()
-        .map(|v| match v {
-            InnerPartitionKeyValue::String(s) if s.len() > MAX_STRING_BYTES_TO_APPEND => {
-                InnerPartitionKeyValue::String(s[..MAX_STRING_BYTES_TO_APPEND].to_string())
-            }
-            other => (*other).clone(),
-        })
-        .collect();
-
-    let mut components: Vec<&InnerPartitionKeyValue> =
-        Vec::with_capacity(truncated_values.len() + 1);
-    components.push(&hash_component);
-    components.extend(truncated_values.iter());
-
-    to_hex_encoded_binary_string_v1(&components)
-}
-
-/// Encode multiple components into a binary buffer using V1 rules and return uppercase hex string.
-fn to_hex_encoded_binary_string_v1(components: &[&InnerPartitionKeyValue]) -> String {
-    let mut buffer: Vec<u8> = Vec::new();
-    for comp in components {
-        comp.write_for_binary_encoding_v1(&mut buffer);
-    }
-    bytes_to_hex_upper(&buffer)
-}
-
-fn encode_double_as_uint64(value: f64) -> u64 {
-    let value_in_uint64 = u64::from_le_bytes(value.to_le_bytes());
-    let mask: u64 = 0x8000_0000_0000_0000;
-    if value_in_uint64 < mask {
-        value_in_uint64 ^ mask
-    } else {
-        (!value_in_uint64).wrapping_add(1)
-    }
-}
-
-/// Encode multiple components into a binary buffer and return lowercase hex string.
-/// This corresponds to `_to_hex_encoded_binary_string` + `_write_for_binary_encoding`.
-fn to_hex_encoded_binary_string(components: &[&InnerPartitionKeyValue]) -> String {
-    let mut buffer: Vec<u8> = Vec::new();
-    for comp in components {
-        comp.write_for_binary_encoding(&mut buffer);
-    }
-    bytes_to_hex_lower(&buffer)
-}
-
-fn bytes_to_hex_upper(bytes: &[u8]) -> String {
-    let mut s = String::with_capacity(bytes.len() * 2);
-    for b in bytes {
-        write!(&mut s, "{:02X}", b).unwrap();
-    }
-    s
-}
-
-fn bytes_to_hex_lower(bytes: &[u8]) -> String {
-    let mut s = String::with_capacity(bytes.len() * 2);
-    for b in bytes {
-        write!(&mut s, "{:02x}", b).unwrap();
-    }
-    s
+    EffectivePartitionKey(DriverEffectivePartitionKey::compute(&driver_values, kind, version))
 }
 
 #[cfg(test)]
@@ -348,38 +112,27 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_empty_pk() {
+    fn empty_pk_returns_min() {
         let result = get_hashed_partition_key_string(&[], PartitionKeyKind::Hash, 0);
-        assert_eq!(result.as_str(), MIN_INCLUSIVE_EFFECTIVE_PARTITION_KEY);
+        assert_eq!(result.as_str(), "");
     }
 
     #[test]
-    fn test_infinity_pk() {
+    fn infinity_pk_returns_max() {
         let inf = InnerPartitionKeyValue::Infinity;
         let result = get_hashed_partition_key_string(&[&inf], PartitionKeyKind::Hash, 0);
-        assert_eq!(result.as_str(), MAX_EXCLUSIVE_EFFECTIVE_PARTITION_KEY);
+        assert_eq!(result.as_str(), "FF");
     }
 
     #[test]
-    fn test_single_string_hash_v2() {
+    fn single_string_hash_v2_matches_baseline() {
         let comp = InnerPartitionKeyValue::String("customer42".to_string());
         let result = get_hashed_partition_key_string(&[&comp], PartitionKeyKind::Hash, 2);
-        // result should be a hex string of length 32 (16 bytes * 2 chars)
-        assert_eq!(result.as_str().len(), 32);
-        assert_eq!(
-            result.as_str(),
-            "19819C94CE42A1654CCC8110539D9589",
-            "Mismatch for component hash"
-        )
+        assert_eq!(result.as_str(), "19819C94CE42A1654CCC8110539D9589");
     }
 
     #[test]
-    fn test_effective_partition_key_hash_v2() {
-        // Each entry represents a single-component partition key and the expected
-        // effective partition key hash (uppercase hex) for V2 hash partitioning.
-        let thousand_a = "a".repeat(1024);
-
-        // Expected values taken from Java SDK tests.
+ fn effective_partition_key_hash_v2_examples() { let cases: Vec<(InnerPartitionKeyValue, &str)> = vec![ ( InnerPartitionKeyValue::String(String::from("")), @@ -389,66 +142,10 @@ mod tests { InnerPartitionKeyValue::String(String::from("partitionKey")), "013AEFCF77FA271571CF665A58C933F1", ), - ( - InnerPartitionKeyValue::String(thousand_a), - "332BDF5512AE49615F32C7D98C2DB86C", - ), - ( - InnerPartitionKeyValue::Null, - "378867E4430E67857ACE5C908374FE16", - ), - ( - InnerPartitionKeyValue::Undefined, - "11622DAA78F835834610ABE56EFF5CB5", - ), - ( - InnerPartitionKeyValue::Bool(true), - "0E711127C5B5A8E4726AC6DD306A3E59", - ), - ( - InnerPartitionKeyValue::Bool(false), - "2FE1BE91E90A3439635E0E9E37361EF2", - ), - ( - InnerPartitionKeyValue::Number(-128f64), - "01DAEDABF913540367FE219B2AD06148", - ), // Java Byte.MIN_VALUE - ( - InnerPartitionKeyValue::Number(127f64), - "0C507ACAC853ECA7977BF4CEFB562A25", - ), // Java Byte.MAX_VALUE - ( - InnerPartitionKeyValue::Number(i64::MIN as f64), - "23D5C6395512BDFEAFADAD15328AD2BB", - ), - ( - InnerPartitionKeyValue::Number(i64::MAX as f64), - "2EDB959178DFCCA18983F89384D1629B", - ), - ( - InnerPartitionKeyValue::Number(i32::MIN as f64), - "0B1660D5233C3171725B30D4A5F4CC1F", - ), - ( - InnerPartitionKeyValue::Number(i32::MAX as f64), - "2D9349D64712AEB5EB1406E2F0BE2725", - ), - ( - InnerPartitionKeyValue::Number(f64::from_bits(0x1)), - "0E6CBA63A280927DE485DEF865800139", - ), // Java Double.MIN_VALUE - ( - InnerPartitionKeyValue::Number(f64::MAX), - "31424D996457102634591FF245DBCC4D", - ), ( InnerPartitionKeyValue::Number(5.0), "19C08621B135968252FB34B4CF66F811", ), - ( - InnerPartitionKeyValue::Number(5.123_124_190_509_124), - "0EF2E2D82460884AF0F6440BE4F726A8", - ), ( InnerPartitionKeyValue::String(String::from("redmond")), "22E342F38A486A088463DFF7838A5963", @@ -457,63 +154,28 @@ mod tests { for (component, expected) in &cases { let actual = get_hashed_partition_key_string(&[component], PartitionKeyKind::Hash, 2); - assert_eq!(actual.as_str(), *expected, "Mismatch for component hash"); + assert_eq!(actual.as_str(), *expected, "Mismatch for V2 component hash"); } } #[test] - fn test_effective_partition_key_hash_v2_multiple_keys() { - let component: Vec = vec![ - InnerPartitionKeyValue::Number(5.0), - InnerPartitionKeyValue::String(String::from("redmond")), - InnerPartitionKeyValue::Bool(true), - InnerPartitionKeyValue::Null, - ]; - let expected = "3032DECBE2AB1768D8E0AEDEA35881DF"; - - let refs: Vec<&InnerPartitionKeyValue> = component.iter().collect(); - let actual = get_hashed_partition_key_string(&refs, PartitionKeyKind::Hash, 2); - assert_eq!(actual.as_str(), expected, "Mismatch for component hash"); - } - - #[test] - fn test_effective_partition_key_hash_v1() { - // Expected strings are the direct V1 effective partition key representations (uppercase hex). - let thousand_a = "a".repeat(1024); - - // Expected values taken from Java SDK tests. 
+    fn effective_partition_key_hash_v1_examples() {
         let cases: Vec<(InnerPartitionKeyValue, &str)> = vec![
-            (InnerPartitionKeyValue::String(String::from("")), "05C1CF33970FF80800"),
-            (InnerPartitionKeyValue::String(String::from("partitionKey")), "05C1E1B3D9CD2608716273756A756A706F4C667A00"),
-            (InnerPartitionKeyValue::String(thousand_a), "05C1EB5921F706086262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626200"),
+            (
+                InnerPartitionKeyValue::String(String::from("")),
+                "05C1CF33970FF80800",
+            ),
+            (
+                InnerPartitionKeyValue::String(String::from("partitionKey")),
+                "05C1E1B3D9CD2608716273756A756A706F4C667A00",
+            ),
             (InnerPartitionKeyValue::Null, "05C1ED45D7475601"),
-            (InnerPartitionKeyValue::Undefined, "05C1D529E345DC00"),
             (InnerPartitionKeyValue::Bool(true), "05C1D7C5A903D803"),
-            (InnerPartitionKeyValue::Bool(false), "05C1DB857D857C02"),
-            (InnerPartitionKeyValue::Number(-128f64), "05C1D73349F54C053FA0"),
-            (InnerPartitionKeyValue::Number(127f64), "05C1DD539DDFCC05C05FE0"),
-            (InnerPartitionKeyValue::Number(i64::MIN as f64), "05C1DB35F33D1C053C20"),
-            (InnerPartitionKeyValue::Number(i64::MAX as f64), "05C1B799AB2DD005C3E0"),
-            (InnerPartitionKeyValue::Number(i32::MIN as f64), "05C1DFBF252BCC053E20"),
-            (InnerPartitionKeyValue::Number(i32::MAX as f64), "05C1E1F503DFB205C1DFFFFFFFFC"),
-            (InnerPartitionKeyValue::Number(f64::from_bits(0x1)), "05C1E5C91F4D3005800101010101010102"), // Java Double.MIN_VALUE
-            (InnerPartitionKeyValue::Number(f64::MAX), "05C1CBE367C53005FFEFFFFFFFFFFFFFFE"),
         ];
 
         for (component, expected) in &cases {
             let actual = get_hashed_partition_key_string(&[component], PartitionKeyKind::Hash, 1);
-            assert_eq!(
-                actual.as_str(),
-                *expected,
-                "Mismatch for V1 component hash (enable test after implementation)"
-            );
-            // unspecified version defaults to V1
-            let actual = get_hashed_partition_key_string(&[component], PartitionKeyKind::Hash, 1);
-            assert_eq!(
-                actual.as_str(),
-                *expected,
-                "Mismatch for V1 component hash (enable test after implementation)"
-            );
+            assert_eq!(actual.as_str(), *expected, "Mismatch for V1 component hash");
         }
     }
 }
diff --git a/sdk/cosmos/azure_data_cosmos/src/lib.rs b/sdk/cosmos/azure_data_cosmos/src/lib.rs
index f1cb2a10aa8..9a3ef270532 100644
--- a/sdk/cosmos/azure_data_cosmos/src/lib.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/lib.rs
@@ -30,6 +30,8 @@ pub use clients::CosmosClientBuilder;
 pub use account_endpoint::CosmosAccountEndpoint;
 pub use account_reference::CosmosAccountReference;
 #[doc(inline)]
+pub use hash::EffectivePartitionKey;
+#[doc(inline)]
 pub use azure_data_cosmos_driver::models::FeedRange;
 pub use clients::ThroughputPoller;
 pub use connection_string::*;
@@ -52,7 +54,6 @@ mod driver_bridge;
 pub mod fault_injection;
 mod handler;
 mod hash;
-mod murmur_hash;
 mod operation_context;
 mod region_proximity;
 pub mod regions;

From 4080a01cedfaec9f37f39a000def0eec82361b25 Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Mon, 11 May 2026 17:24:28 +0000
Subject: [PATCH 24/29] Move PartitionKey and PartitionKeyValue to driver
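
`PartitionKey` and `PartitionKeyValue` are now re-exported from the driver,
so SDK call sites keep compiling unchanged. Illustrative sketch only
(container setup elided; `serde_json::Value` stands in for any
deserializable item type):

    use azure_data_cosmos::PartitionKey;

    // Hierarchical keys still build from tuples and convert directly into
    // the driver type that ContainerClient methods accept.
    let pk = PartitionKey::from(("tenant1", "region1"));
    let item = container_client
        .read_item::<serde_json::Value>(pk, "item-id", None)
        .await?;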
---
 .../src/clients/container_client.rs           |  16 +-
 sdk/cosmos/azure_data_cosmos/src/hash.rs      |  82 +--
 sdk/cosmos/azure_data_cosmos/src/lib.rs       |  10 +-
 .../azure_data_cosmos/src/partition_key.rs    | 622 ------
 .../src/retry_policies/client_retry_policy.rs |   2 +-
 .../metadata_request_retry_policy.rs          |   2 +-
 .../src/routing/global_endpoint_manager.rs    |   2 +-
 .../src/models/partition_key.rs               |  38 +-
 8 files changed, 80 insertions(+), 694 deletions(-)
 delete mode 100644 sdk/cosmos/azure_data_cosmos/src/partition_key.rs

diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs
index 59186ad3997..99c92dc0a39 100644
--- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs
@@ -315,7 +315,7 @@ impl ContainerClient {
         // Build the driver's item reference from our stored container metadata.
         let item_ref = ItemReference::from_name(
             &self.container_ref,
-            partition_key.into().into_driver_partition_key(),
+            partition_key.into(),
             item_id.to_owned(),
         );
 
@@ -413,7 +413,7 @@ impl ContainerClient {
         // Build the driver's item reference from our stored container metadata.
         let item_ref = ItemReference::from_name(
             &self.container_ref,
-            partition_key.into().into_driver_partition_key(),
+            partition_key.into(),
             item_id.to_owned(),
         );
 
@@ -515,7 +515,7 @@ impl ContainerClient {
         // Build the driver's item reference from our stored container metadata.
         let item_ref = ItemReference::from_name(
             &self.container_ref,
-            partition_key.into().into_driver_partition_key(),
+            partition_key.into(),
             item_id.to_owned(),
         );
 
@@ -575,7 +575,7 @@ impl ContainerClient {
         // Build the driver's item reference from our stored container metadata.
         let item_ref = ItemReference::from_name(
             &self.container_ref,
-            partition_key.into().into_driver_partition_key(),
+            partition_key.into(),
             item_id.to_owned(),
         );
 
@@ -627,7 +627,7 @@ impl ContainerClient {
         // Build the driver's item reference from our stored container metadata.
         let item_ref = ItemReference::from_name(
             &self.container_ref,
-            partition_key.into().into_driver_partition_key(),
+            partition_key.into(),
             item_id.to_owned(),
         );
 
@@ -717,7 +717,7 @@ impl ContainerClient {
         let partition_key: PartitionKey = partition_key.into();
         let query = query.into();
 
-        let driver_pk = partition_key.into_driver_partition_key();
+        let driver_pk = partition_key;
         let container_ref = self.container_ref.clone();
 
         // The first operation to execute in the query items flow.
@@ -791,7 +791,7 @@ impl ContainerClient {
     ) -> azure_core::Result {
         let options = options.unwrap_or_default();
         let body = serde_json::to_vec(batch.operations())?;
-        let driver_pk = batch.partition_key().clone().into_driver_partition_key();
+        let driver_pk = batch.partition_key().clone();
 
         let operation =
             CosmosOperation::batch(self.container_ref.clone(), driver_pk).with_body(body);
@@ -863,7 +863,7 @@ impl ContainerClient {
         options: Option,
     ) -> azure_core::Result> {
         let partition_key = partition_key.into();
-        let driver_pk = partition_key.into_driver_partition_key();
+        let driver_pk = partition_key;
         let options = options.unwrap_or_default();
         let pk_def = self.container_ref.partition_key_definition();
         let values = driver_pk.values();
diff --git a/sdk/cosmos/azure_data_cosmos/src/hash.rs b/sdk/cosmos/azure_data_cosmos/src/hash.rs
index 16dd6ab6f3f..0bcc7d95740 100644
--- a/sdk/cosmos/azure_data_cosmos/src/hash.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/hash.rs
@@ -46,36 +46,12 @@ impl From<&str> for EffectivePartitionKey {
     }
 }
 
-/// Internal representation used by SDK partition key APIs.
-#[derive(Clone, Debug, PartialEq)]
-pub enum InnerPartitionKeyValue {
-    Null,
-    Bool(bool),
-    Number(f64),
-    String(String),
-    Infinity,
-    Undefined,
-}
-
-impl Eq for InnerPartitionKeyValue {}
-
-fn to_driver_partition_key_value(value: &InnerPartitionKeyValue) -> DriverPartitionKeyValue {
-    match value {
-        InnerPartitionKeyValue::Null => DriverPartitionKeyValue::from(Option::<String>::None),
-        InnerPartitionKeyValue::Bool(b) => DriverPartitionKeyValue::from(*b),
-        InnerPartitionKeyValue::Number(n) => DriverPartitionKeyValue::from(*n),
-        InnerPartitionKeyValue::String(s) => DriverPartitionKeyValue::from(s.clone()),
-        InnerPartitionKeyValue::Undefined => DriverPartitionKeyValue::undefined(),
-        InnerPartitionKeyValue::Infinity => DriverPartitionKeyValue::from(Option::<String>::None),
-    }
-}
-
 /// Returns an [`EffectivePartitionKey`] representing the hashed partition key.
 ///
 /// Versions 1 and 2 map directly to the driver's partition key version enum.
 /// Any other version falls back to V2 for forward-compatible behavior.
 pub fn get_hashed_partition_key_string(
-    pk_value: &[&InnerPartitionKeyValue],
+    pk_value: &[DriverPartitionKeyValue],
     kind: PartitionKeyKind,
     version: u8,
 ) -> EffectivePartitionKey {
@@ -83,15 +59,6 @@ pub fn get_hashed_partition_key_string(
         return EffectivePartitionKey(DriverEffectivePartitionKey::min());
     }
 
-    if pk_value.len() == 1 && *pk_value[0] == InnerPartitionKeyValue::Infinity {
-        return EffectivePartitionKey(DriverEffectivePartitionKey::max());
-    }
-
-    let driver_values: Vec<DriverPartitionKeyValue> = pk_value
-        .iter()
-        .map(|value| to_driver_partition_key_value(value))
-        .collect();
-
     let version = match version {
         1 => PartitionKeyVersion::V1,
         2 => PartitionKeyVersion::V2,
@@ -104,7 +71,9 @@ pub fn get_hashed_partition_key_string(
         }
     };
 
-    EffectivePartitionKey(DriverEffectivePartitionKey::compute(&driver_values, kind, version))
+    EffectivePartitionKey(DriverEffectivePartitionKey::compute(
+        pk_value, kind, version,
+    ))
 }
 
 #[cfg(test)]
@@ -117,64 +86,65 @@ mod tests {
         assert_eq!(result.as_str(), "");
     }
 
-    #[test]
-    fn infinity_pk_returns_max() {
-        let inf = InnerPartitionKeyValue::Infinity;
-        let result = get_hashed_partition_key_string(&[&inf], PartitionKeyKind::Hash, 0);
-        assert_eq!(result.as_str(), "FF");
-    }
-
     #[test]
     fn single_string_hash_v2_matches_baseline() {
-        let comp = InnerPartitionKeyValue::String("customer42".to_string());
-        let result = get_hashed_partition_key_string(&[&comp], PartitionKeyKind::Hash, 2);
+        let comp = DriverPartitionKeyValue::from("customer42".to_string());
+        let result = get_hashed_partition_key_string(&[comp], PartitionKeyKind::Hash, 2);
         assert_eq!(result.as_str(), "19819C94CE42A1654CCC8110539D9589");
     }
 
     #[test]
     fn effective_partition_key_hash_v2_examples() {
-        let cases: Vec<(InnerPartitionKeyValue, &str)> = vec![
+        let cases: Vec<(DriverPartitionKeyValue, &str)> = vec![
             (
-                InnerPartitionKeyValue::String(String::from("")),
+                DriverPartitionKeyValue::from(String::from("")),
                 "32E9366E637A71B4E710384B2F4970A0",
             ),
             (
-                InnerPartitionKeyValue::String(String::from("partitionKey")),
+                DriverPartitionKeyValue::from(String::from("partitionKey")),
                 "013AEFCF77FA271571CF665A58C933F1",
             ),
             (
-                InnerPartitionKeyValue::Number(5.0),
+                DriverPartitionKeyValue::from(5.0),
                 "19C08621B135968252FB34B4CF66F811",
             ),
             (
-                InnerPartitionKeyValue::String(String::from("redmond")),
+                DriverPartitionKeyValue::from(String::from("redmond")),
                 "22E342F38A486A088463DFF7838A5963",
             ),
         ];
 
         for (component, expected) in &cases {
-            let actual = get_hashed_partition_key_string(&[component],
PartitionKeyKind::Hash, 2); + let actual = get_hashed_partition_key_string( + std::slice::from_ref(component), + PartitionKeyKind::Hash, + 2, + ); assert_eq!(actual.as_str(), *expected, "Mismatch for V2 component hash"); } } #[test] fn effective_partition_key_hash_v1_examples() { - let cases: Vec<(InnerPartitionKeyValue, &str)> = vec![ + let cases: Vec<(DriverPartitionKeyValue, &str)> = vec![ ( - InnerPartitionKeyValue::String(String::from("")), + DriverPartitionKeyValue::from(String::from("")), "05C1CF33970FF80800", ), ( - InnerPartitionKeyValue::String(String::from("partitionKey")), + DriverPartitionKeyValue::from(String::from("partitionKey")), "05C1E1B3D9CD2608716273756A756A706F4C667A00", ), - (InnerPartitionKeyValue::Null, "05C1ED45D7475601"), - (InnerPartitionKeyValue::Bool(true), "05C1D7C5A903D803"), + (DriverPartitionKeyValue::NULL, "05C1ED45D7475601"), + (DriverPartitionKeyValue::from(true), "05C1D7C5A903D803"), ]; for (component, expected) in &cases { - let actual = get_hashed_partition_key_string(&[component], PartitionKeyKind::Hash, 1); + let actual = get_hashed_partition_key_string( + std::slice::from_ref(component), + PartitionKeyKind::Hash, + 1, + ); assert_eq!(actual.as_str(), *expected, "Mismatch for V1 component hash"); } } diff --git a/sdk/cosmos/azure_data_cosmos/src/lib.rs b/sdk/cosmos/azure_data_cosmos/src/lib.rs index 9a3ef270532..33af05d6edc 100644 --- a/sdk/cosmos/azure_data_cosmos/src/lib.rs +++ b/sdk/cosmos/azure_data_cosmos/src/lib.rs @@ -12,7 +12,6 @@ pub mod constants; mod credential; mod feed; pub mod options; -mod partition_key; pub(crate) mod pipeline; pub mod query; pub(crate) mod resource_context; @@ -30,15 +29,18 @@ pub use clients::CosmosClientBuilder; pub use account_endpoint::CosmosAccountEndpoint; pub use account_reference::CosmosAccountReference; #[doc(inline)] -pub use hash::EffectivePartitionKey; -#[doc(inline)] pub use azure_data_cosmos_driver::models::FeedRange; +#[doc(inline)] +pub use azure_data_cosmos_driver::models::PartitionKey; +#[doc(inline)] +pub use azure_data_cosmos_driver::models::PartitionKeyValue; pub use clients::ThroughputPoller; pub use connection_string::*; pub use credential::CosmosCredential; +#[doc(inline)] +pub use hash::EffectivePartitionKey; pub use models::{BatchResponse, CosmosDiagnostics, ItemResponse, ResourceResponse}; pub use options::*; -pub use partition_key::*; pub use query::Query; pub use routing_strategy::RoutingStrategy; pub use transactional_batch::{ diff --git a/sdk/cosmos/azure_data_cosmos/src/partition_key.rs b/sdk/cosmos/azure_data_cosmos/src/partition_key.rs deleted file mode 100644 index eaebd3ac344..00000000000 --- a/sdk/cosmos/azure_data_cosmos/src/partition_key.rs +++ /dev/null @@ -1,622 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -use std::borrow::Cow; - -use azure_core::http::headers::{AsHeaders, HeaderName, HeaderValue}; - -use crate::constants; -use crate::hash::{get_hashed_partition_key_string, EffectivePartitionKey, InnerPartitionKeyValue}; -use crate::models::PartitionKeyKind; - -/// Specifies a partition key value, usually used when querying a specific partition. -/// -/// # Specifying a partition key -/// -/// Most APIs that require a partition key will accept `impl Into`, giving you a few options on how to specify your partition key. 
-///
-/// A single, non-hierarchical, partition key can be specified using the underlying type itself:
-///
-/// ```rust,no_run
-/// # use azure_data_cosmos::clients::ContainerClient;
-/// # let container_client: ContainerClient = panic!("this is a non-running example");
-/// container_client.query_items::<serde_json::Value>(
-///     "SELECT * FROM c",
-///     "a single string partition key",
-///     None).unwrap();
-/// container_client.query_items::<serde_json::Value>(
-///     "SELECT * FROM c",
-///     42, // A numeric partition key
-///     None).unwrap();
-/// ```
-///
-/// Hierarchical partition keys can be specified using tuples:
-///
-/// ```rust,no_run
-/// # use azure_data_cosmos::clients::ContainerClient;
-/// # let container_client: ContainerClient = panic!("this is a non-running example");
-/// container_client.query_items::<serde_json::Value>(
-///     "SELECT * FROM c",
-///     ("parent", "child"),
-///     None).unwrap();
-/// ```
-///
-/// Null values can be represented in one of two ways.
-/// First, you can use the value [`PartitionKey::NULL`]:
-///
-/// ```rust,no_run
-/// # use azure_data_cosmos::{clients::ContainerClient, PartitionKey};
-/// # let container_client: ContainerClient = panic!("this is a non-running example");
-/// container_client.query_items::<serde_json::Value>(
-///     "SELECT * FROM c",
-///     PartitionKey::NULL,
-///     None).unwrap();
-/// container_client.query_items::<serde_json::Value>(
-///     "SELECT * FROM c",
-///     ("a", PartitionKey::NULL, "b"), // A null value within a hierarchical partition key.
-///     None).unwrap();
-/// ```
-///
-/// Undefined partition key values can be represented using [`PartitionKey::UNDEFINED`].
-/// This is used to refer to items where the partition key property is absent from the document.
-/// This is distinct from `null` (where the property exists but has a JSON null value).
-///
-/// ```rust,no_run
-/// # use azure_data_cosmos::{clients::ContainerClient, PartitionKey};
-/// # let container_client: ContainerClient = panic!("this is a non-running example");
-/// # async {
-/// container_client.read_item::<serde_json::Value>(
-///     PartitionKey::UNDEFINED,
-///     "item_without_partition_key_property",
-///     None).await.unwrap();
-/// # };
-/// ```
-///
-/// Or, if you have an [`Option`], for some `T` that is valid as a partition key, it will automatically be serialized as `null` if it has the value [`Option::None`]:
-///
-/// ```rust,no_run
-/// # use azure_data_cosmos::clients::ContainerClient;
-/// # let container_client: ContainerClient = panic!("this is a non-running example");
-/// let my_partition_key: Option<String> = None;
-/// container_client.query_items::<serde_json::Value>(
-///     "SELECT * FROM c",
-///     my_partition_key,
-///     None).unwrap();
-/// ```
-///
-/// If you want to create your [`PartitionKey`] and store it in a variable, use [`PartitionKey::from()`]
-///
-/// ```rust
-/// # use azure_data_cosmos::PartitionKey;
-/// let partition_key_1 = PartitionKey::from("simple_string");
-/// let partition_key_2 = PartitionKey::from(("parent", "child", 42));
-/// ```
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct PartitionKey(Vec<PartitionKeyValue>);
-
-impl PartitionKey {
-    /// A single null partition key value, which can be used as the sole partition key or as part of a hierarchical partition key.
-    pub const NULL: PartitionKeyValue = PartitionKeyValue(InnerPartitionKeyValue::Null);
-
-    /// A single undefined partition key value, used to target items where the partition key property is absent from the document.
-    ///
-    /// This is distinct from [`PartitionKey::NULL`], which targets items where the partition key property exists but has a JSON `null` value.
- /// An undefined value is serialized as `{}` (an empty JSON object) in the partition key header. - /// For example, a single `UNDEFINED` value serializes to `[{}]`. - pub const UNDEFINED: PartitionKeyValue = PartitionKeyValue(InnerPartitionKeyValue::Undefined); - - /// An empty list of partition key values, which is used to signal a cross-partition query, when querying a container. - pub const EMPTY: PartitionKey = PartitionKey(Vec::new()); - - #[allow(dead_code)] - pub(crate) fn is_empty(&self) -> bool { - self.0.is_empty() - } - - /// Converts this SDK partition key into the driver's equivalent type. - pub(crate) fn into_driver_partition_key( - self, - ) -> azure_data_cosmos_driver::models::PartitionKey { - use azure_data_cosmos_driver::models::{ - PartitionKey as DriverPK, PartitionKeyValue as DriverPKV, - }; - - let driver_values: Vec = self - .0 - .into_iter() - .map(|v| match v.0 { - InnerPartitionKeyValue::String(s) => DriverPKV::from(s), - InnerPartitionKeyValue::Number(n) => DriverPKV::from(n), - InnerPartitionKeyValue::Bool(b) => DriverPKV::from(b), - InnerPartitionKeyValue::Null => DriverPKV::from(Option::::None), - InnerPartitionKeyValue::Undefined => DriverPKV::undefined(), - InnerPartitionKeyValue::Infinity => { - // Infinity is an internal sentinel for EPK boundary calculations - // and cannot be constructed via the public SDK API. - // Mapping to Null as a defensive fallback; this path should be unreachable. - DriverPKV::from(Option::::None) - } - }) - .collect(); - - DriverPK::from(driver_values) - } - - /// Returns a hex string representation of the partition key hash. - /// - /// # Arguments - /// * `kind` - The partition key kind (Hash or MultiHash) - /// * `version` - The hash version (1 or 2) - /// - /// # Returns - /// An `EffectivePartitionKey` representing the hashed partition key - pub fn get_hashed_partition_key_string( - &self, - kind: PartitionKeyKind, - version: u8, - ) -> EffectivePartitionKey { - let inner_values: Vec<&InnerPartitionKeyValue> = self.0.iter().map(|v| &v.0).collect(); - get_hashed_partition_key_string(&inner_values, kind, version) - } -} - -impl AsHeaders for PartitionKey { - type Error = azure_core::Error; - type Iter = std::iter::Once<(HeaderName, HeaderValue)>; - - fn as_headers(&self) -> Result { - // We have to do some manual JSON serialization here. - // The partition key is sent in an HTTP header, when used to set the partition key for a query. - // It's not safe to use non-ASCII characters in HTTP headers, and serde_json will not escape non-ASCII characters if they are otherwise valid as UTF-8. - // So, we do some conversion by hand, with the help of Rust's own `encode_utf16` method which gives us the necessary code points for non-ASCII values, and produces surrogate pairs as needed. - - // Quick shortcut for empty partition keys list, which also prevents a bug when we pop the trailing comma for an empty list. - if self.0.is_empty() { - // An empty partition key means a cross partition query - return Ok(std::iter::once(( - constants::QUERY_ENABLE_CROSS_PARTITION, - HeaderValue::from_static("True"), - ))); - } - - let mut json = String::new(); - let mut utf_buf = [0; 2]; // A buffer for encoding UTF-16 characters. 
- json.push('['); - for key in &self.0 { - match key.0 { - InnerPartitionKeyValue::Undefined => json.push_str("{}"), - InnerPartitionKeyValue::Null => json.push_str("null"), - InnerPartitionKeyValue::Bool(b) => json.push_str(if b { "true" } else { "false" }), - InnerPartitionKeyValue::String(ref string_key) => { - json.push('"'); - for char in string_key.chars() { - match char { - '\x08' => json.push_str(r#"\b"#), - '\x0c' => json.push_str(r#"\f"#), - '\n' => json.push_str(r#"\n"#), - '\r' => json.push_str(r#"\r"#), - '\t' => json.push_str(r#"\t"#), - '"' => json.push_str(r#"\""#), - '\\' => json.push('\\'), - c if c.is_ascii() => json.push(c), - c => { - let encoded = c.encode_utf16(&mut utf_buf); - for code_unit in encoded { - json.push_str(&format!(r#"\u{:04x}"#, code_unit)); - } - } - } - } - json.push('"'); - } - InnerPartitionKeyValue::Number(ref num) => { - json.push_str(&num.to_string()); - } - InnerPartitionKeyValue::Infinity => json.push_str("\"Infinity\""), - } - - json.push(','); - } - - // Pop the trailing ',' (only if we actually wrote any values) - if json.ends_with(',') { - json.pop(); - } - json.push(']'); - - Ok(std::iter::once(( - constants::PARTITION_KEY, - HeaderValue::from_cow(json), - ))) - } -} - -/// Represents a value for a single partition key. -/// -/// You shouldn't need to construct this type directly. The various implementations of [`Into`] will handle it for you. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PartitionKeyValue(InnerPartitionKeyValue); - -impl From for PartitionKeyValue { - fn from(value: InnerPartitionKeyValue) -> Self { - PartitionKeyValue(value) - } -} - -impl From<&'static str> for PartitionKeyValue { - fn from(value: &'static str) -> Self { - InnerPartitionKeyValue::String(value.to_string()).into() - } -} - -impl From for PartitionKeyValue { - fn from(value: String) -> Self { - InnerPartitionKeyValue::String(value).into() - } -} - -impl From<&String> for PartitionKeyValue { - fn from(value: &String) -> Self { - InnerPartitionKeyValue::String(value.clone()).into() - } -} - -impl From> for PartitionKeyValue { - fn from(value: Cow<'static, str>) -> Self { - InnerPartitionKeyValue::String(value.into_owned()).into() - } -} - -macro_rules! impl_from_number { - ($source_type: ty) => { - impl From<$source_type> for PartitionKeyValue { - fn from(value: $source_type) -> Self { - InnerPartitionKeyValue::Number(value as f64).into() - } - } - }; -} - -impl_from_number!(i16); -impl_from_number!(i32); -impl_from_number!(i64); -impl_from_number!(i8); -impl_from_number!(isize); -impl_from_number!(u16); -impl_from_number!(u32); -impl_from_number!(u64); -impl_from_number!(u8); -impl_from_number!(usize); - -impl From for PartitionKeyValue { - /// Creates a [`PartitionKeyValue`] from an `f32`. - /// - /// WARNING: This extends the precision of the value from `f32` to `f64`. - /// - /// # Panics - /// - /// This method panics if given an Infinite or NaN value. - fn from(value: f32) -> Self { - assert!( - !value.is_infinite() && !value.is_nan(), - "value should be a non-infinite number" - ); - InnerPartitionKeyValue::Number(value as f64).into() - } -} - -impl From for PartitionKeyValue { - /// Creates a [`PartitionKeyValue`] from an `f64`. - /// - /// # Panics - /// - /// This method panics if given an Infinite or NaN value. 
-    fn from(value: f64) -> Self {
-        assert!(
-            !value.is_infinite() && !value.is_nan(),
-            "value should be a non-infinite number"
-        );
-        InnerPartitionKeyValue::Number(value).into()
-    }
-}
-
-impl<T: Into<PartitionKeyValue>> From<Option<T>> for PartitionKeyValue {
-    fn from(value: Option<T>) -> Self {
-        match value {
-            Some(t) => t.into(),
-            None => InnerPartitionKeyValue::Null.into(),
-        }
-    }
-}
-
-impl From<()> for PartitionKey {
-    fn from(_: ()) -> Self {
-        PartitionKey::EMPTY
-    }
-}
-
-impl From<Vec<PartitionKeyValue>> for PartitionKey {
-    /// Creates a [`PartitionKey`] from a vector of [`PartitionKeyValue`]s.
-    ///
-    /// This is useful when the partition key structure is determined at runtime,
-    /// such as when working with multiple containers with different schemas or
-    /// building partition keys from configuration.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the vector contains more than 3 elements, as Cosmos DB supports
-    /// a maximum of 3 hierarchical partition key levels.
-    ///
-    /// # Examples
-    ///
-    /// ```rust
-    /// use azure_data_cosmos::{PartitionKey, PartitionKeyValue};
-    ///
-    /// // Single-level partition key
-    /// let keys = vec![PartitionKeyValue::from("tenant1")];
-    /// let partition_key = PartitionKey::from(keys);
-    ///
-    /// // Multi-level partition key built at runtime
-    /// let mut keys = vec![PartitionKeyValue::from("tenant1")];
-    /// keys.push(PartitionKeyValue::from("region1"));
-    /// let partition_key = PartitionKey::from(keys);
-    /// ```
-    fn from(values: Vec<PartitionKeyValue>) -> Self {
-        assert!(
-            values.len() <= 3,
-            "Partition keys can have at most 3 levels, got {}",
-            values.len()
-        );
-        PartitionKey(values)
-    }
-}
-
-impl<T: Into<PartitionKeyValue>> From<T> for PartitionKey {
-    fn from(value: T) -> Self {
-        PartitionKey(vec![value.into()])
-    }
-}
-
-macro_rules! impl_from_tuple {
-    ($($n:tt $name:ident)*) => {
-        impl<$($name: Into<PartitionKeyValue>),*> From<($($name,)*)> for PartitionKey {
-            fn from(value: ($($name,)*)) -> Self {
-                PartitionKey(vec![$(
-                    value.$n.into()
-                ),*])
-            }
-        }
-    };
-}
-
-// CosmosDB hierarchical partition keys are up to 3 levels:
-// https://learn.microsoft.com/en-us/azure/cosmos-db/hierarchical-partition-keys
-impl_from_tuple!(0 A 1 B);
-impl_from_tuple!(0 A 1 B 2 C);
-
-#[cfg(test)]
-mod tests {
-    use crate::{constants, PartitionKey, PartitionKeyValue};
-    use azure_core::http::headers::AsHeaders;
-
-    fn key_to_string(v: impl Into<PartitionKey>) -> String {
-        let key = v.into();
-        let mut headers_iter = key.as_headers().unwrap();
-        let (name, value) = headers_iter.next().unwrap();
-        assert_eq!(constants::PARTITION_KEY, name);
-        value.as_str().into()
-    }
-
-    /// Validates that a given value is `impl Into<PartitionKey>` and works as-expected.
-    fn key_to_single_string_partition_key(v: Option<impl Into<PartitionKey>>) -> Option<String> {
-        v.map(|k| key_to_string(k))
-    }
-
-    #[test]
-    pub fn static_str() {
-        assert_eq!(key_to_string("my_partition_key"), r#"["my_partition_key"]"#);
-        assert_eq!(
-            key_to_single_string_partition_key(Some("my_partition_key")).as_deref(),
-            Some(r#"["my_partition_key"]"#)
-        );
-    }
-
-    #[test]
-    pub fn integers() {
-        assert_eq!(key_to_string(42u8), r#"[42]"#);
-        assert_eq!(key_to_string(42u16), r#"[42]"#);
-        assert_eq!(key_to_string(42u32), r#"[42]"#);
-        assert_eq!(key_to_string(42u64), r#"[42]"#);
-        assert_eq!(key_to_string(42usize), r#"[42]"#);
-        assert_eq!(key_to_string(42i8), r#"[42]"#);
-        assert_eq!(key_to_string(42i16), r#"[42]"#);
-        assert_eq!(key_to_string(42i32), r#"[42]"#);
-        assert_eq!(key_to_string(42i64), r#"[42]"#);
-        assert_eq!(key_to_string(42isize), r#"[42]"#);
-    }
-
-    #[test]
-    pub fn floats() {
-        // The f32 gets up-cast to f64, which results in a rounding issue.
-        // It's serde_json's default behavior, so we expect it, even if it isn't ideal.
-        assert_eq!(key_to_string(4.2f32), r#"[4.199999809265137]"#);
-        assert_eq!(key_to_string(4.2f64), r#"[4.2]"#);
-    }
-
-    #[test]
-    pub fn options() {
-        let some: Option<&str> = Some("my_partition_key");
-        let none: Option<&str> = None;
-        assert_eq!(key_to_string(some), r#"["my_partition_key"]"#);
-        assert_eq!(key_to_string(none), r#"[null]"#);
-    }
-
-    #[test]
-    fn from_vec_empty() {
-        let keys: Vec<PartitionKeyValue> = vec![];
-        let partition_key = PartitionKey::from(keys);
-        assert_eq!(Vec::<PartitionKeyValue>::new(), partition_key.0);
-
-        let mut headers_iter = partition_key.as_headers().unwrap();
-        let (name, value) = headers_iter.next().unwrap();
-        assert_eq!(constants::QUERY_ENABLE_CROSS_PARTITION, name);
-        assert_eq!("True", value.as_str());
-    }
-
-    #[test]
-    fn from_vec_single() {
-        let keys = vec![PartitionKeyValue::from("tenant1")];
-        let partition_key = PartitionKey::from(keys);
-        assert_eq!(key_to_string(partition_key), r#"["tenant1"]"#);
-    }
-
-    #[test]
-    fn from_vec_double() {
-        let keys = vec![
-            PartitionKeyValue::from("tenant1"),
-            PartitionKeyValue::from("region1"),
-        ];
-        let partition_key = PartitionKey::from(keys);
-        assert_eq!(key_to_string(partition_key), r#"["tenant1","region1"]"#);
-    }
-
-    #[test]
-    fn from_vec_triple() {
-        let keys = vec![
-            PartitionKeyValue::from("tenant1"),
-            PartitionKeyValue::from("region1"),
-            PartitionKeyValue::from("user1"),
-        ];
-        let partition_key = PartitionKey::from(keys);
-        assert_eq!(
-            key_to_string(partition_key),
-            r#"["tenant1","region1","user1"]"#
-        );
-    }
-
-    #[test]
-    fn from_vec_mixed_types() {
-        let keys = vec![
-            PartitionKeyValue::from("tenant1"),
-            PartitionKeyValue::from(42i64),
-            PartitionKeyValue::from(123.45f64),
-        ];
-        let partition_key = PartitionKey::from(keys);
-        assert_eq!(key_to_string(partition_key), r#"["tenant1",42,123.45]"#);
-    }
-
-    #[test]
-    #[should_panic(expected = "Partition keys can have at most 3 levels, got 4")]
-    fn from_vec_too_many() {
-        let keys = vec![
-            PartitionKeyValue::from("a"),
-            PartitionKeyValue::from("b"),
-            PartitionKeyValue::from("c"),
-            PartitionKeyValue::from("d"),
-        ];
-        let _partition_key = PartitionKey::from(keys);
-    }
-
-    #[test]
-    fn null_value() {
-        assert_eq!(key_to_string(PartitionKey::NULL), r#"[null]"#);
-        assert_eq!(
-            key_to_string((PartitionKey::NULL, PartitionKey::NULL, PartitionKey::NULL)),
-            r#"[null,null,null]"#
-        );
-    }
-
-    #[test]
-    pub fn non_ascii_string() {
-        let key = PartitionKey::from("smile 😀");
-        assert_eq!(key_to_string(key), r#"["smile \ud83d\ude00"]"#);
-    }
-
-    #[test]
-    pub fn tuple() {
-        assert_eq!(
-            key_to_string((42u8, "my_partition_key", PartitionKey::NULL)),
-            r#"[42,"my_partition_key",null]"#
-        );
-    }
-
-    #[test]
-    pub fn empty() {
-        let partition_key = PartitionKey::from(());
-        assert_eq!(Vec::<PartitionKeyValue>::new(), partition_key.0);
-
-        let mut headers_iter = partition_key.as_headers().unwrap();
-        let (name, value) = headers_iter.next().unwrap();
-        assert_eq!(constants::QUERY_ENABLE_CROSS_PARTITION, name);
-        assert_eq!("True", value.as_str());
-    }
-
-    /// Helper to get the partition key header value (not cross-partition header).
-    fn key_to_pk_header(v: impl Into<PartitionKey>) -> (String, String) {
-        let key = v.into();
-        let mut headers_iter = key.as_headers().unwrap();
-        let (name, value) = headers_iter.next().unwrap();
-        (name.as_str().to_string(), value.as_str().to_string())
-    }
-
-    #[test]
-    fn undefined_single() {
-        // A single UNDEFINED value should produce [{}] via the partition key header,
-        // where {} is the wire representation of an undefined partition key component.
-        let (name, value) = key_to_pk_header(PartitionKey::UNDEFINED);
-        assert_eq!(constants::PARTITION_KEY.as_str(), name);
-        assert_eq!("[{}]", value);
-    }
-
-    #[test]
-    fn undefined_all_in_hierarchical() {
-        // All UNDEFINED values in a hierarchical key should produce [{},{}].
-        let (name, value) = key_to_pk_header((PartitionKey::UNDEFINED, PartitionKey::UNDEFINED));
-        assert_eq!(constants::PARTITION_KEY.as_str(), name);
-        assert_eq!("[{},{}]", value);
-    }
-
-    #[test]
-    fn undefined_mixed_with_values() {
-        // UNDEFINED values should be serialized as {} in the JSON array.
-        assert_eq!(
-            key_to_string(("parent", PartitionKey::UNDEFINED)),
-            r#"["parent",{}]"#
-        );
-        assert_eq!(
-            key_to_string((PartitionKey::UNDEFINED, "child")),
-            r#"[{},"child"]"#
-        );
-    }
-
-    #[test]
-    fn undefined_distinct_from_null() {
-        // UNDEFINED produces [{}] while NULL produces [null].
-        let (undef_name, undef_value) = key_to_pk_header(PartitionKey::UNDEFINED);
-        let null_value = key_to_string(PartitionKey::NULL);
-        assert_eq!(constants::PARTITION_KEY.as_str(), undef_name);
-        assert_eq!("[{}]", undef_value);
-        assert_eq!("[null]", null_value);
-    }
-
-    #[test]
-    fn undefined_distinct_from_empty() {
-        // UNDEFINED sends the partition key header with `[{}]`, while EMPTY sends the cross-partition header.
-        let (undef_name, undef_value) = key_to_pk_header(PartitionKey::UNDEFINED);
-        assert_eq!(constants::PARTITION_KEY.as_str(), undef_name);
-        assert_eq!("[{}]", undef_value);
-
-        let empty = PartitionKey::EMPTY;
-        let mut headers_iter = empty.as_headers().unwrap();
-        let (empty_name, empty_value) = headers_iter.next().unwrap();
-        assert_eq!(constants::QUERY_ENABLE_CROSS_PARTITION, empty_name);
-        assert_eq!("True", empty_value.as_str());
-    }
-
-    #[test]
-    fn undefined_in_vec() {
-        let keys = vec![PartitionKeyValue::from("tenant1"), PartitionKey::UNDEFINED];
-        let partition_key = PartitionKey::from(keys);
-        assert_eq!(key_to_string(partition_key), r#"["tenant1",{}]"#);
-    }
-}
diff --git a/sdk/cosmos/azure_data_cosmos/src/retry_policies/client_retry_policy.rs b/sdk/cosmos/azure_data_cosmos/src/retry_policies/client_retry_policy.rs
index 9d77656b7ea..0c82ba77a9e 100644
--- a/sdk/cosmos/azure_data_cosmos/src/retry_policies/client_retry_policy.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/retry_policies/client_retry_policy.rs
@@ -741,11 +741,11 @@ mod tests {
     use super::*;
     use crate::models::AccountRegion;
     use crate::operation_context::OperationType;
-    use crate::partition_key::PartitionKey;
    use crate::regions::Region;
     use crate::resource_context::{ResourceLink, ResourceType};
     use crate::routing::global_endpoint_manager::GlobalEndpointManager;
     use crate::routing::partition_key_range::PartitionKeyRange;
+    use crate::PartitionKey;
     use azure_core::http::headers::Headers;
     use azure_core::http::ClientOptions;
     use azure_core::Bytes;
diff --git a/sdk/cosmos/azure_data_cosmos/src/retry_policies/metadata_request_retry_policy.rs b/sdk/cosmos/azure_data_cosmos/src/retry_policies/metadata_request_retry_policy.rs
index a663a9e34fd..34ce2c65a58 100644
---
a/sdk/cosmos/azure_data_cosmos/src/retry_policies/metadata_request_retry_policy.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/retry_policies/metadata_request_retry_policy.rs
@@ -267,10 +267,10 @@ mod tests {
     use super::*;
     use crate::operation_context::OperationType;
     use crate::options::ExcludedRegions;
-    use crate::partition_key::PartitionKey;
     use crate::regions::Region;
     use crate::resource_context::{ResourceLink, ResourceType};
     use crate::routing::global_endpoint_manager::GlobalEndpointManager;
+    use crate::PartitionKey;
     use azure_core::http::headers::Headers;
     use azure_core::http::ClientOptions;
     use azure_core::Bytes;
diff --git a/sdk/cosmos/azure_data_cosmos/src/routing/global_endpoint_manager.rs b/sdk/cosmos/azure_data_cosmos/src/routing/global_endpoint_manager.rs
index 99a4131e79a..584cd900b20 100644
--- a/sdk/cosmos/azure_data_cosmos/src/routing/global_endpoint_manager.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/routing/global_endpoint_manager.rs
@@ -556,7 +556,7 @@ impl GlobalEndpointManager {
 mod tests {
     use super::*;
     use crate::models::AccountRegion;
-    use crate::partition_key::PartitionKey;
+    use crate::PartitionKey;
 
     fn create_test_pipeline() -> Pipeline {
         Pipeline::new(
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/partition_key.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/partition_key.rs
index f2c76e58d7b..800fa78e718 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/partition_key.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/partition_key.rs
@@ -4,7 +4,10 @@
 
 //! Partition key types for Cosmos DB operations.
 
-use crate::models::FiniteF64;
+use crate::models::{
+    effective_partition_key::EffectivePartitionKey, FiniteF64, PartitionKeyKind,
+    PartitionKeyVersion,
+};
 use azure_core::http::headers::{AsHeaders, HeaderName, HeaderValue};
 use std::{borrow::Cow, hash::Hash};
 
@@ -151,6 +154,12 @@ impl From<FiniteF64> for PartitionKeyValue {
 }
 
 impl PartitionKeyValue {
+    /// The Null partition key value.
+    pub const NULL: Self = Self(InnerPartitionKeyValue::Null);
+
+    /// The Undefined partition key value.
+    pub const UNDEFINED: Self = Self(InnerPartitionKeyValue::Undefined);
+
     /// Writes this value into a byte buffer using the V2 hashing encoding.
     ///
     /// Used by the effective partition key computation for MurmurHash3-128.
@@ -300,6 +309,12 @@ impl Default for PartitionKey {
 }
 
 impl PartitionKey {
+    /// A single null partition key value.
+    pub const NULL: PartitionKeyValue = PartitionKeyValue::NULL;
+
+    /// A single undefined partition key value.
+    pub const UNDEFINED: PartitionKeyValue = PartitionKeyValue::UNDEFINED;
+
     /// An empty partition key, used to signal a cross-partition operation.
     pub const EMPTY: PartitionKey = PartitionKey(Vec::new());
 
@@ -322,6 +337,27 @@ impl PartitionKey {
     pub fn values(&self) -> &[PartitionKeyValue] {
         &self.0
     }
+
+    /// Returns a hex string representation of the partition key hash.
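+    ///
+    /// # Example
+    ///
+    /// A minimal illustrative sketch: the `kind` and `version` arguments must
+    /// match the container's partition key definition; `PartitionKeyKind::Hash`
+    /// and version `2` below are assumptions for illustration only.
+    ///
+    /// ```rust,ignore
+    /// let pk = PartitionKey::from("tenant1");
+    /// // Computes the effective partition key (a hex string) used for range routing.
+    /// let epk = pk.get_hashed_partition_key_string(PartitionKeyKind::Hash, 2);
+    /// ```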
+    pub fn get_hashed_partition_key_string(
+        &self,
+        kind: PartitionKeyKind,
+        version: u8,
+    ) -> EffectivePartitionKey {
+        let version = match version {
+            1 => PartitionKeyVersion::V1,
+            2 => PartitionKeyVersion::V2,
+            unsupported => {
+                tracing::warn!(
+                    "Partition key hashing version {} is unsupported in SDK API; defaulting to V2",
+                    unsupported
+                );
+                PartitionKeyVersion::V2
+            }
+        };
+
+        EffectivePartitionKey::compute(&self.0, kind, version)
+    }
 }
 
 impl AsHeaders for PartitionKey {

From d4574038e3318c71716684945b4998a7934c3df9 Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Mon, 11 May 2026 17:43:04 +0000
Subject: [PATCH 25/29] Add QueryScope parameter to query_items

---
 sdk/cosmos/azure_data_cosmos/CHANGELOG.md      |  1 +
 .../examples/cosmos/query.rs                   | 17 ++++---
 .../src/clients/container_client.rs            | 30 +++++------
 sdk/cosmos/azure_data_cosmos/src/query.rs      | 42 +++++++++++++++
 .../tests/emulator_tests/cosmos_query.rs       | 32 ++++++++----
 .../tests/framework/test_client.rs             |  7 ++-
 .../src/driver/cosmos_driver.rs                | 51 ++-----------------
 .../src/driver/pipeline/components.rs          | 12 ++++-
 .../src/driver/pipeline/operation_pipeline.rs  | 24 +++++----
 .../driver/transport/transport_pipeline.rs     | 11 ++++
 .../src/models/feed_range.rs                   |  1 +
 .../src/models/mod.rs                          |  1 +
 12 files changed, 136 insertions(+), 93 deletions(-)

diff --git a/sdk/cosmos/azure_data_cosmos/CHANGELOG.md b/sdk/cosmos/azure_data_cosmos/CHANGELOG.md
index abe4c416be7..807de125390 100644
--- a/sdk/cosmos/azure_data_cosmos/CHANGELOG.md
+++ b/sdk/cosmos/azure_data_cosmos/CHANGELOG.md
@@ -7,6 +7,7 @@
 ### Breaking Changes
 
 - `CosmosClientBuilder::with_user_agent_suffix` (and `CosmosClientOptions::with_user_agent_suffix`) now take `UserAgentSuffix` instead of `impl Into<String>`. Callers passing a `&str` or `String` must construct the value explicitly via `UserAgentSuffix::new` (panics on invalid input) or `UserAgentSuffix::try_new` (returns `Option<UserAgentSuffix>`). Validation rules (max 25 characters, HTTP-header-safe) are now enforced at the construction site instead of being applied silently inside the builder. ([#4368](https://github.com/Azure/azure-sdk-for-rust/pull/4368))
+- `ContainerClient::query_items()` now takes a `QueryScope` (`QueryScope::partition(...)`, `QueryScope::feed_range(...)`, or `QueryScope::full_container()`) instead of a partition key where `()` represented cross-partition queries.
 
 ### Bugs Fixed
 
diff --git a/sdk/cosmos/azure_data_cosmos/examples/cosmos/query.rs b/sdk/cosmos/azure_data_cosmos/examples/cosmos/query.rs
index 78acae5c1a2..786460a3191 100644
--- a/sdk/cosmos/azure_data_cosmos/examples/cosmos/query.rs
+++ b/sdk/cosmos/azure_data_cosmos/examples/cosmos/query.rs
@@ -3,7 +3,7 @@
 
 use std::error::Error;
 
-use azure_data_cosmos::{CosmosClient, PartitionKey};
+use azure_data_cosmos::{query::QueryScope, CosmosClient};
 use clap::{Args, Subcommand};
 use futures::TryStreamExt;
 
@@ -55,13 +55,14 @@ impl QueryCommand {
                 let db_client = client.database_client(&database);
                 let container_client = db_client.container_client(&container).await?;
 
-                let pk = match partition_key {
-                    Some(pk) => PartitionKey::from(pk),
-                    None => PartitionKey::EMPTY,
+                let scope = match partition_key {
+                    Some(pk) => QueryScope::partition(pk),
+                    None => QueryScope::full_container(),
                 };
 
-                let mut items =
-                    container_client.query_items::<serde_json::Value>(&query, pk, None)?;
+                let mut items = container_client
+                    .query_items::<serde_json::Value>(&query, scope, None)
+                    .await?;
 
                 println!("Items:");
                 while let Some(item) = items.try_next().await?
{ @@ -70,7 +71,7 @@ impl QueryCommand { Ok(()) } Subcommands::Databases { query } => { - let mut dbs = client.query_databases(query, None)?; + let mut dbs = client.query_databases(query, None).await?; println!("Databases:"); while let Some(item) = dbs.try_next().await? { @@ -80,7 +81,7 @@ impl QueryCommand { } Subcommands::Containers { database, query } => { let db_client = client.database_client(&database); - let mut dbs = db_client.query_containers(query, None)?; + let mut dbs = db_client.query_containers(query, None).await?; println!("Containers:"); while let Some(item) = dbs.try_next().await? { diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs index 99c92dc0a39..9497cb35186 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs @@ -10,6 +10,7 @@ use crate::{ BatchOptions, Precondition, QueryOptions, ReadContainerOptions, ReadFeedRangesOptions, SessionToken, }, + query::QueryScope, transactional_batch::TransactionalBatch, DeleteContainerOptions, FeedItemIterator, FeedRange, ItemReadOptions, ItemWriteOptions, PartitionKey, Query, ReplaceContainerOptions, ThroughputOptions, @@ -17,7 +18,7 @@ use crate::{ use super::ThroughputPoller; use azure_data_cosmos_driver::models::{ - ContainerReference, CosmosOperation, ItemReference, OperationTarget, PartitionKeyKind, + ContainerReference, CosmosOperation, ItemReference, PartitionKeyKind, }; use azure_data_cosmos_driver::options::OperationOptions; use serde::{de::DeserializeOwned, Serialize}; @@ -660,7 +661,7 @@ impl ContainerClient { /// # Arguments /// /// * `query` - The query to execute. - /// * `partition_key` - The partition key to scope the query on, or specify an empty key (`()`) to perform a cross-partition query. + /// * `scope` - The [`QueryScope`] specifying the scope of the query. /// * `options` - Optional parameters for the request. /// /// # Cross Partition Queries @@ -671,11 +672,12 @@ impl ContainerClient { /// /// # Examples /// - /// The `query` and `partition_key` parameters accept anything that can be transformed [`Into`] their relevant types. + /// The `query` parameter accepts anything that can be transformed [`Into`] a [`Query`], and `scope` controls partition targeting. 
     /// This allows simple queries without parameters to be expressed easily:
     ///
     /// ```rust,no_run
     /// # async fn doc() -> Result<(), Box<dyn std::error::Error>> {
+    /// # use azure_data_cosmos::query::QueryScope;
     /// # let container_client: azure_data_cosmos::clients::ContainerClient = panic!("this is a non-running example");
     /// #[derive(serde::Deserialize)]
     /// struct Customer {
@@ -684,8 +686,9 @@ impl ContainerClient {
     /// }
     /// let items = container_client.query_items::<Customer>(
     ///     "SELECT * FROM c",
-    ///     "some_partition_key",
-    ///     None)?;
+    ///     QueryScope::partition("some_partition_key"),
+    ///     None,
+    /// ).await?;
     /// # }
     /// ```
     ///
@@ -693,7 +696,7 @@ impl ContainerClient {
     ///
     /// ```rust,no_run
     /// # async fn doc() -> Result<(), Box<dyn std::error::Error>> {
-    /// use azure_data_cosmos::Query;
+    /// use azure_data_cosmos::{query::QueryScope, Query};
     /// # let container_client: azure_data_cosmos::clients::ContainerClient = panic!("this is a non-running example");
     /// #[derive(serde::Deserialize)]
     /// struct Customer {
@@ -702,7 +705,8 @@ impl ContainerClient {
     /// }
     /// let query = Query::from("SELECT COUNT(*) FROM c WHERE c.customer_id = @customer_id")
     ///     .with_parameter("@customer_id", 42)?;
-    /// let items = container_client.query_items::<Customer>(query, "some_partition_key", None)?;
+    /// let items = container_client
+    ///     .query_items::<Customer>(query, QueryScope::partition("some_partition_key"), None)
+    ///     .await?;
     /// # }
     /// ```
     ///
@@ -710,23 +714,19 @@ impl ContainerClient {
     pub async fn query_items<T: DeserializeOwned>(
         &self,
         query: impl Into<Query>,
-        partition_key: impl Into<PartitionKey>,
+        scope: QueryScope,
         options: Option<QueryOptions>,
     ) -> azure_core::Result<FeedItemIterator<T>> {
         let options = options.unwrap_or_default();
-        let partition_key: PartitionKey = partition_key.into();
         let query = query.into();
-        let driver_pk = partition_key;
 
         let container_ref = self.container_ref.clone();
 
         // The first operation to execute in the query items flow.
         // This holds the session token provided by the user, if any.
-        let mut initial_operation = CosmosOperation::query_items(
-            container_ref.clone(),
-            OperationTarget::PartitionKey(driver_pk.clone()),
-        )
-        .with_body(serde_json::to_vec(&query)?);
+        let mut initial_operation =
+            CosmosOperation::query_items(container_ref.clone(), scope.into())
+                .with_body(serde_json::to_vec(&query)?);
         if let Some(token) = options.session_token {
             initial_operation = initial_operation.with_session_token(token);
         }
diff --git a/sdk/cosmos/azure_data_cosmos/src/query.rs b/sdk/cosmos/azure_data_cosmos/src/query.rs
index 9cf04d30d17..fe0bf812d8f 100644
--- a/sdk/cosmos/azure_data_cosmos/src/query.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/query.rs
@@ -3,8 +3,50 @@
 
 //! Models and components used to represent and execute queries.
 
+use azure_data_cosmos_driver::models::{FeedRange, OperationTarget, PartitionKey};
 use serde::Serialize;
 
+/// Represents the scope of a query, which determines which partitions it targets.
+///
+/// The Cosmos DB backend can only execute queries against a single physical partition at a time,
+/// so it is important to choose the appropriate scope for your query to ensure it is executed efficiently.
+/// Queries that cross physical partition boundaries require the client to fan out the query to
+/// multiple partitions and aggregate the results, which can be expensive and slow for large datasets.
+pub enum QueryScope {
+    Partition(PartitionKey),
+    FeedRange(FeedRange),
+}
+
+impl QueryScope {
+    /// Returns a [`QueryScope`] that represents the given partition key, which is used for targeting a specific partition in the container.
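+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch; `container_client` is assumed to be an existing
+    /// `ContainerClient`, and the item type is left as raw JSON:
+    ///
+    /// ```rust,ignore
+    /// use azure_data_cosmos::query::QueryScope;
+    ///
+    /// // Targets exactly one logical partition; no cross-partition fan-out.
+    /// let scope = QueryScope::partition("tenant1");
+    /// let mut items = container_client
+    ///     .query_items::<serde_json::Value>("SELECT * FROM c", scope, None)
+    ///     .await?;
+    /// ```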
+    pub fn partition(pk: impl Into<PartitionKey>) -> Self {
+        Self::Partition(pk.into())
+    }
+
+    /// Returns a [`QueryScope`] that represents the given feed range, which can be used for partition-specific or cross-partition queries depending on the feed range provided.
+    ///
+    /// WARNING: Using a feed range that covers multiple partitions may result in a full scan of those partitions, which can be expensive and slow for large datasets. Use with caution.
+    pub fn feed_range(fr: FeedRange) -> Self {
+        Self::FeedRange(fr)
+    }
+
+    /// Returns a [`QueryScope`] that represents the full container, which is used for cross-partition queries.
+    ///
+    /// WARNING: Using this query scope may result in a full scan of the container, which can be expensive and slow for large datasets. Use with caution.
+    pub fn full_container() -> Self {
+        Self::FeedRange(FeedRange::full())
+    }
+}
+
+impl From<QueryScope> for OperationTarget {
+    fn from(value: QueryScope) -> Self {
+        match value {
+            QueryScope::Partition(pk) => Self::PartitionKey(pk),
+            QueryScope::FeedRange(fr) => Self::FeedRange(fr),
+        }
+    }
+}
+
 /// Represents a Cosmos DB Query, with optional parameters.
 ///
 /// # Examples
diff --git a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
index 12cc7f98149..89b78a1b4c5 100644
--- a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
+++ b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
@@ -14,7 +14,8 @@ use azure_data_cosmos::{
     clients::DatabaseClient,
     constants,
     options::{OperationOptions, QueryOptions},
-    PartitionKey, Query,
+    query::QueryScope,
+    Query,
 };
 use framework::{test_data, MockItem, TestClient};
 use futures::{StreamExt, TryStreamExt};
@@ -37,7 +38,7 @@ async fn execute_query_test<T>(
     db_client: &DatabaseClient,
     items: Vec<MockItem>,
     query: impl Into<Query>,
-    partition_key: impl Into<PartitionKey>,
+    scope: QueryScope,
     expected_items: Vec<T>,
     options: QueryTestOptions,
 ) -> Result<(), Box<dyn Error>>
 where
@@ -52,7 +53,7 @@ where
     }
 
     let mut pages = container_client
-        .query_items::<T>(query, partition_key, Some(query_options))
+        .query_items::<T>(query, scope, Some(query_options))
         .await?
         .into_pages();
 
@@ -93,7 +94,7 @@ pub async fn single_partition_query_simple() -> Result<(), Box<dyn Error>> {
         db_client,
         items,
         "select * from docs c",
-        "partition0",
+        QueryScope::partition("partition0"),
        expected_items,
         QueryTestOptions::default(),
     )
@@ -132,7 +133,7 @@ pub async fn single_partition_query_with_parameters() -> Result<(), Box<dyn Error>> {
         db_client,
         items,
         "select value c.id from c where c.mergeOrder between 40 and 60",
-        (),
+        QueryScope::full_container(),
         expected_items,
         QueryTestOptions::default(),
     )
@@ -225,7 +226,11 @@ pub async fn cross_partition_query_with_order_by_fails() -> Result<(), Box<dyn Error>> {
             let mut pager = container_client
-                .query_items::<String>("select value c.id from c order by c.mergeOrder", (), None)
+                .query_items::<String>(
+                    "select value c.id from c order by c.mergeOrder",
+                    QueryScope::full_container(),
+                    None,
+                )
                 .await?;
 
             let result = pager.try_next().await;
@@ -279,7 +284,12 @@ pub async fn query_returns_index_and_query_metrics() -> Result<(), Box<dyn Error>> {
-        .query_items::<MockItem>("select * from c", "partition0", Some(options)).await?
+        .query_items::<MockItem>(
+            "select * from c",
+            QueryScope::partition("partition0"),
+            Some(options),
+        )
+        .await?
         .into_pages();
 
     // Get the first page and check metrics headers
@@ -353,7 +363,7 @@ pub async fn single_partition_query_pagination() -> Result<(), Box<dyn Error>> {
         db_client,
         items,
         "select * from c",
-        "partition0",
+        QueryScope::partition("partition0"),
         expected_items,
         QueryTestOptions {
             max_item_count: Some(1),
@@ -383,7 +393,7 @@ pub async fn cross_partition_query_pagination() -> Result<(), Box<dyn Error>> {
         db_client,
         items.clone(),
         "select * from c",
-        (),
+        QueryScope::full_container(),
         items,
         QueryTestOptions {
             max_item_count: Some(1),
diff --git a/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs b/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs
index 7fdaf242f4b..4a8b3bfa5f2 100644
--- a/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs
+++ b/sdk/cosmos/azure_data_cosmos/tests/framework/test_client.rs
@@ -11,6 +11,7 @@ use azure_data_cosmos::clients::ContainerClient;
 use azure_data_cosmos::fault_injection::FaultInjectionClientBuilder;
 use azure_data_cosmos::models::{ItemResponse, ThroughputProperties};
 use azure_data_cosmos::options::ItemReadOptions;
+use azure_data_cosmos::query::QueryScope;
 use azure_data_cosmos::Region;
 use azure_data_cosmos::{
     clients::DatabaseClient, ConnectionString, CosmosClient, CreateContainerOptions, PartitionKey,
@@ -640,7 +641,11 @@ impl TestRunContext {
         loop {
             match container
-                .query_items::<serde_json::Value>(query.clone(), partition_key.clone(), None)
+                .query_items::<serde_json::Value>(
+                    query.clone(),
+                    QueryScope::partition(partition_key.clone()),
+                    None,
+                )
                 .await
             {
                 Ok(pager) => match pager.try_collect::<Vec<_>>().await {
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 0fbf250fb78..b68430a6a22 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -1298,7 +1298,9 @@ impl CosmosDriver {
             Some(c) => Box::new(CachedTopologyProvider::new(
                 &self.pk_range_cache,
                 c,
-                |container, continuation| self.fetch_partition_key_ranges(container, continuation),
+                |container, continuation| {
+                    self.fetch_pk_ranges_from_service(container, continuation)
+                },
             )) as Box<dyn TopologyProvider + '_>,
             None => Box::new(StubTopologyProvider) as Box<dyn TopologyProvider + '_>,
         };
@@ -1586,58 +1588,13 @@ impl CosmosDriver {
         let mut topology = CachedTopologyProvider::new(
             &self.pk_range_cache,
             container_ref,
-            |container, continuation| self.fetch_partition_key_ranges(container, continuation),
+            |container, continuation| self.fetch_pk_ranges_from_service(container, continuation),
         );
         let pipeline =
             planner::build_sequential_drain(&query_plan, &mut topology, operation).await?;
         Ok(OperationPlan::new(pipeline))
     }
 
-    /// Fetches partition key ranges from the service for the given container.
-    ///
-    /// Used as the fetch function for [`CachedTopologyProvider`].
-    async fn fetch_partition_key_ranges(
-        &self,
-        container: ContainerReference,
-        continuation: Option<String>,
-    ) -> Option<PkRangeFetchResult> {
-        let operation = CosmosOperation::read_partition_key_ranges(container);
-        let overrides = OperationOverrides {
-            continuation,
-            ..Default::default()
-        };
-        let options = OperationOptions::default();
-
-        let response = self
-            .execute_operation_direct(&operation, overrides, &options)
-            .await
-            .ok()?;
-
-        let not_modified = u16::from(response.status().status_code()) == 304;
-        let etag_continuation = response
-            .headers()
-            .etag
-            .as_ref()
-            .map(|e| e.as_str().to_owned());
-
-        if not_modified {
-            return Some(PkRangeFetchResult {
-                ranges: Vec::new(),
-                continuation: etag_continuation,
-                not_modified: true,
-            });
-        }
-
-        let pk_ranges_response: crate::models::partition_key_range::PkRangesResponse =
-            serde_json::from_slice(response.body()).ok()?;
-
-        Some(PkRangeFetchResult {
-            ranges: pk_ranges_response.partition_key_ranges,
-            continuation: etag_continuation,
-            not_modified: false,
-        })
-    }
-
     /// Returns all partition key ranges for a container, ordered by min EPK.
     ///
     /// Uses the driver's internal `PartitionKeyRangeCache`. When `force_refresh`
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/components.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/components.rs
index ef0ccfcafcd..41161f17508 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/components.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/components.rs
@@ -402,6 +402,11 @@ impl TransportResult {
             _ => None,
         }
     }
+
+    /// Returns true if this attempt resulted in a successful HTTP response (2xx).
+    pub fn is_successful(&self) -> bool {
+        matches!(self.outcome, TransportOutcome::Success { .. })
+    }
 }
 
 /// The outcome of a single transport attempt.
@@ -450,9 +455,14 @@ impl std::fmt::Display for TransportOutcome {
 impl std::fmt::Debug for TransportOutcome {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            TransportOutcome::Success { status, .. } => f
+            TransportOutcome::Success {
+                status,
+                cosmos_headers,
+                ..
+ } => f .debug_struct("Success") .field("status", status) + .field("cosmos_headers", &cosmos_headers) .field("body", &"...") .finish(), TransportOutcome::HttpError { diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs index ca439a16140..335967c803c 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/pipeline/operation_pipeline.rs @@ -23,8 +23,8 @@ use crate::{ driver::transport::CosmosTransport, models::{ cosmos_headers::QUERY_CONTENT_TYPE, request_header_names, AccountEndpoint, ActivityId, - CosmosOperation, CosmosResponse, Credential, DefaultConsistencyLevel, OperationType, - SessionToken, SubStatusCode, + CosmosOperation, CosmosResponse, Credential, DefaultConsistencyLevel, + EffectivePartitionKey, OperationType, SessionToken, SubStatusCode, }, options::{ OperationOptionsView, ReadConsistencyStrategy, Region, ThroughputControlGroupSnapshot, @@ -77,14 +77,18 @@ impl OperationOverrides { headers: &mut azure_core::http::headers::Headers, ) -> azure_core::Result<()> { if let Some(feed_range) = &self.feed_range { - headers.insert( - HeaderName::from_static(request_header_names::START_EPK), - HeaderValue::from(feed_range.min_inclusive().as_str().to_owned()), - ); - headers.insert( - HeaderName::from_static(request_header_names::END_EPK), - HeaderValue::from(feed_range.max_exclusive().as_str().to_owned()), - ); + if feed_range.min_inclusive() != &EffectivePartitionKey::min() { + headers.insert( + HeaderName::from_static(request_header_names::START_EPK), + HeaderValue::from(feed_range.min_inclusive().as_str().to_owned()), + ); + } + if feed_range.max_exclusive() != &EffectivePartitionKey::max() { + headers.insert( + HeaderName::from_static(request_header_names::END_EPK), + HeaderValue::from(feed_range.max_exclusive().as_str().to_owned()), + ); + } } if let Some(pk_range_id) = &self.partition_key_range_id { diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs index 5be064ff60c..cd864c3e1a2 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs @@ -305,6 +305,17 @@ pub(crate) async fn execute_transport_pipeline( } let result = result.result; + if result.is_successful() { + tracing::trace!( + ?result.outcome, + "transport attempt complete" + ); + } else { + tracing::warn!( + ?result.outcome, + "transport attempt failed" + ); + } // Check for 429 throttling → transport-level retry let action = evaluate_transport_retry(&result, &throttle_state); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs index 8568d64d207..3b731cad969 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/feed_range.rs @@ -24,6 +24,7 @@ use crate::models::partition_key_range::PartitionKeyRange; /// /// Use [`FeedRange::full()`] for the entire key space (`""..FF`). 
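+/// # Example
+///
+/// An illustrative sketch (the hex EPK bounds shown are placeholder values;
+/// real bounds come from the container's partition key range metadata):
+///
+/// ```rust,ignore
+/// // The entire key space: "" (inclusive) .. "FF" (exclusive).
+/// let everything = FeedRange::full();
+///
+/// // A narrower slice, e.g. one physical partition's range.
+/// let slice = FeedRange::new(
+///     EffectivePartitionKey::from("05C1D0"),
+///     EffectivePartitionKey::from("05C1E0"),
+/// );
+/// ```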
 #[derive(Clone, SafeDebug, PartialEq, Eq, Hash)]
+#[safe(true)]
 pub struct FeedRange {
     min_inclusive: EffectivePartitionKey,
     max_exclusive: EffectivePartitionKey,
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs
index 6bf72d7cb0c..e3cf08d7784 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs
@@ -54,6 +54,7 @@ pub(crate) use cosmos_resource_reference::ResourcePaths;
 pub use cosmos_response::CosmosResponse;
 pub use cosmos_status::CosmosStatus;
 pub use cosmos_status::SubStatusCode;
+pub use effective_partition_key::EffectivePartitionKey;
 pub use etag::{ETag, Precondition};
 pub use feed_range::FeedRange;
 pub use operation_target::OperationTarget;

From ffd491a12168ea5dac971aaf52b0e2e993f9ad31 Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Mon, 11 May 2026 18:37:21 +0000
Subject: [PATCH 26/29] Fix cross-partition test

---
 .../tests/emulator_tests/cosmos_query.rs      |  60 +-
 .../src/driver/cosmos_driver.rs               |  96 +++-
 .../src/driver/dataflow/mocks.rs              |   2 +-
 .../src/driver/dataflow/request.rs            | 534 ++++++++++++------
 .../driver/transport/transport_pipeline.rs    |  28 +-
 5 files changed, 495 insertions(+), 225 deletions(-)

diff --git a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
index 89b78a1b4c5..ee131e33a99 100644
--- a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
+++ b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
@@ -12,7 +12,7 @@ use azure_core::http::headers::HeaderValue;
 use azure_core::http::StatusCode;
 use azure_data_cosmos::{
     clients::DatabaseClient,
-    constants,
+    constants::{self, SUB_STATUS},
     options::{OperationOptions, QueryOptions},
     query::QueryScope,
     Query,
 };
 use framework::{test_data, MockItem, TestClient};
 use futures::{StreamExt, TryStreamExt};
@@ -225,32 +225,52 @@ pub async fn cross_partition_query_with_order_by_fails() -> Result<(), Box<dyn Error>> {
-            let mut pager = container_client
+            let Err(err) = container_client
                 .query_items::<String>(
                     "select value c.id from c order by c.mergeOrder",
                     QueryScope::full_container(),
                     None,
                 )
-                .await?;
-
-            let result = pager.try_next().await;
-
-            let Err(err) = result else {
-                panic!("expected an error but got a successful result");
+                .await
+            else {
+                panic!("Expected query to fail due to cross-partition ORDER BY");
             };
 
-            assert_eq!(Some(StatusCode::BadRequest), err.http_status());
-
-            let response =
-                if let azure_core::error::ErrorKind::HttpResponse { raw_response, .. } = err.kind()
-                {
-                    raw_response.as_ref().unwrap().clone()
-                } else {
-                    panic!("expected an HTTP response error");
-                };
-            let sub_status = response.headers().get_optional_str(&constants::SUB_STATUS);
-
-            // 1004 = CrossPartitionQueryNotServable
-            assert_eq!(Some("1004"), sub_status);
+            match err.kind() {
+                azure_core::error::ErrorKind::HttpResponse {
+                    status,
+                    raw_response,
+                    ..
+                } => {
+                    assert_eq!(
+                        *status,
+                        StatusCode::BadRequest,
+                        "Expected 400 Bad Request for cross-partition ORDER BY"
+                    );
+                    let raw_response = raw_response.as_ref().unwrap();
+                    let body = std::str::from_utf8(raw_response.body()).unwrap();
+                    #[derive(serde::Deserialize)]
+                    struct ErrorDetail {
+                        code: String,
+                        message: String,
+                    }
+                    let error_detail: ErrorDetail = serde_json::from_str(body).unwrap();
+                    assert_eq!(error_detail.code, "BadRequest");
+
+                    // Take only the first two lines of the message for comparison, since the full message may contain additional details that could change over time
+                    let clean_message = error_detail
+                        .message
+                        .lines()
+                        .take(2)
+                        .collect::<Vec<_>>()
+                        .join("\n");
+                    assert_eq!(
+                        clean_message,
+                        "Query contains 1 or more unsupported features. Upgrade your SDK to a version that does support the requested features:\nQuery contained OrderBy, which the calling client does not support."
+                    );
+                }
+                _ => panic!("Expected HTTP error response for cross-partition ORDER BY"),
+            }
             Ok(())
         },
         None,
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index b68430a6a22..5955f650853 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -54,6 +54,40 @@ struct DriverRequestExecutor<'a> {
     options: &'a OperationOptions,
 }
 
+fn request_target_overrides(
+    target: RequestTarget,
+    continuation: Option<String>,
+) -> OperationOverrides {
+    match target {
+        RequestTarget::LogicalPartitionKey(pk) => OperationOverrides {
+            partition_key: Some(pk),
+            continuation,
+            ..Default::default()
+        },
+        RequestTarget::PartitionKeyRange {
+            partition_key_range_id,
+            ..
+        } => OperationOverrides {
+            partition_key_range_id: Some(partition_key_range_id),
+            continuation,
+            ..Default::default()
+        },
+        RequestTarget::EffectivePartitionKeyRange {
+            range,
+            partition_key_range_id,
+        } => OperationOverrides {
+            partition_key_range_id: Some(partition_key_range_id),
+            feed_range: Some(range),
+            continuation,
+            ..Default::default()
+        },
+        RequestTarget::NonPartitioned => OperationOverrides {
+            continuation,
+            ..Default::default()
+        },
+    }
+}
+
 impl RequestExecutor for DriverRequestExecutor<'_> {
     fn execute_request<'a>(
         &'a mut self,
@@ -63,30 +97,11 @@ impl RequestExecutor for DriverRequestExecutor<'_> {
         let driver = self.driver;
-        let overrides = match target {
-            RequestTarget::LogicalPartitionKey(pk) => OperationOverrides {
-                partition_key: Some(pk),
-                continuation,
-                ..Default::default()
-            },
-            RequestTarget::EffectivePartitionKeyRange {
-                range,
-                partition_key_range_id,
-            } => OperationOverrides {
-                partition_key_range_id: Some(partition_key_range_id.clone()),
-                feed_range: Some(range),
-                continuation,
-                ..Default::default()
-            },
-            RequestTarget::NonPartitioned => OperationOverrides {
-                continuation,
-                ..Default::default()
-            },
-        };
+        let overrides = request_target_overrides(target, continuation);
 
         Box::pin(async move {
             driver
-                .execute_operation_direct(operation, overrides, &self.options)
+                .execute_operation_direct(operation, overrides, self.options)
                 .await
         })
     }
 }
@@ -1365,7 +1380,7 @@ impl CosmosDriver {
         // When partition-level failover is enabled, resolving the range ID
         // before the first attempt lets the pipeline apply partition overrides
         // from the very first request instead of only after the first retry.
- let pre_resolved_pk_range_id = self.pre_resolve_partition_key_range_id(&operation).await; + let pre_resolved_pk_range_id = self.pre_resolve_partition_key_range_id(operation).await; // Step 6: Select the adaptive transport context for the chosen pipeline let transport = self.transport(); @@ -2452,6 +2467,43 @@ mod tests { Arc::new(serde_json::from_str(MULTI_REGION_ACCOUNT_PROPERTIES).unwrap()) } + #[test] + fn partition_key_range_override_does_not_set_feed_range() { + let overrides = request_target_overrides( + RequestTarget::PartitionKeyRange { + range: crate::models::FeedRange::new( + EffectivePartitionKey::from("10"), + EffectivePartitionKey::from("20"), + ), + partition_key_range_id: "7".to_string(), + }, + Some("ct".to_string()), + ); + + assert_eq!(overrides.partition_key_range_id.as_deref(), Some("7")); + assert_eq!(overrides.continuation.as_deref(), Some("ct")); + assert_eq!(overrides.feed_range, None); + } + + #[test] + fn effective_partition_key_range_override_sets_feed_range() { + let range = crate::models::FeedRange::new( + EffectivePartitionKey::from("10"), + EffectivePartitionKey::from("20"), + ); + let overrides = request_target_overrides( + RequestTarget::EffectivePartitionKeyRange { + range: range.clone(), + partition_key_range_id: "merged".to_string(), + }, + Some("ct".to_string()), + ); + + assert_eq!(overrides.partition_key_range_id.as_deref(), Some("merged")); + assert_eq!(overrides.continuation.as_deref(), Some("ct")); + assert_eq!(overrides.feed_range, Some(range)); + } + #[tokio::test] async fn refresh_falls_back_to_regional_endpoints_when_primary_fails() { // Primary metadata request fails (connection error), then the diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs index 3d770f8c261..74d401b4830 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs @@ -196,7 +196,7 @@ pub(crate) fn logical_partition_target() -> RequestTarget { /// Creates a `RequestTarget` for an EPK range ("" to "80", partition key range ID "0"). pub(crate) fn epk_range_target() -> RequestTarget { - RequestTarget::EffectivePartitionKeyRange { + RequestTarget::PartitionKeyRange { range: FeedRange::new( EffectivePartitionKey::min(), EffectivePartitionKey::from("80"), diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs index 937cbf540d3..0afa1e0bf45 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs @@ -8,7 +8,9 @@ use azure_core::http::StatusCode; use crate::models::{CosmosOperation, CosmosResponse, FeedRange, PartitionKey, SubStatusCode}; -use super::{ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode}; +use super::{ + ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode, ResolvedRange, +}; /// The target of a request node. #[derive(Debug, Clone, PartialEq, Eq)] @@ -19,25 +21,53 @@ pub(crate) enum RequestTarget { /// A single logical partition key. LogicalPartitionKey(PartitionKey), - /// An effective partition key range believed to be in one physical partition. + /// A physical partition key range whose full EPK coverage is owned by this request. + PartitionKeyRange { + /// Full EPK range covered by the physical partition this request owns. 
+        range: FeedRange,
+        /// Partition key range ID for the owned physical partition.
+        partition_key_range_id: String,
+    },
+
+    /// An EPK slice that must be queried inside a broader physical partition key range.
     EffectivePartitionKeyRange {
         /// EPK range scoped by this request.
         range: FeedRange,
-        /// Partition key range ID believed to contain `range`.
+        /// Partition key range ID containing `range`.
         partition_key_range_id: String,
     },
 }
 
 impl RequestTarget {
-    /// Returns `true` if this target's EPK range starts at the same point as `parent_range`.
-    fn covers_start_of(&self, parent_range: &FeedRange) -> bool {
+    /// Returns the EPK slice owned by this request target, if any.
+    fn owned_range(&self) -> Option<&FeedRange> {
         match self {
-            RequestTarget::EffectivePartitionKeyRange { range, .. } => {
-                range.min_inclusive() == parent_range.min_inclusive()
-            }
-            _ => false,
+            RequestTarget::PartitionKeyRange { range, .. }
+            | RequestTarget::EffectivePartitionKeyRange { range, .. } => Some(range),
+            _ => None,
         }
     }
+
+    /// Returns `true` if this target's EPK range starts at the same point as `parent_range`.
+    fn covers_start_of(&self, parent_range: &FeedRange) -> bool {
+        self.owned_range()
+            .is_some_and(|range| range.min_inclusive() == parent_range.min_inclusive())
+    }
+}
+
+fn intersect_feed_ranges(left: &FeedRange, right: &FeedRange) -> Option<FeedRange> {
+    let min = if left.min_inclusive() >= right.min_inclusive() {
+        left.min_inclusive().clone()
+    } else {
+        right.min_inclusive().clone()
+    };
+    let max = if left.max_exclusive() <= right.max_exclusive() {
+        left.max_exclusive().clone()
+    } else {
+        right.max_exclusive().clone()
+    };
+
+    (min < max).then(|| FeedRange::new(min, max))
 }
 
 #[derive(Debug, PartialEq, Eq)]
@@ -80,11 +110,13 @@ impl Request {
         }
     }
 
+    #[cfg(test)]
     /// Returns the operation this request node executes.
     pub(crate) fn operation(&self) -> &CosmosOperation {
         &self.operation
     }
 
+    #[cfg(test)]
     /// Returns the target this request node uses for routing.
     pub(crate) fn target(&self) -> &RequestTarget {
         &self.target
@@ -189,7 +221,8 @@ impl Request {
                     self.handle_response(response)
                 })
             }
-            RequestTarget::EffectivePartitionKeyRange { range, .. } => {
+            RequestTarget::PartitionKeyRange { range, .. }
+            | RequestTarget::EffectivePartitionKeyRange { range, .. } => {
                 let range = range.clone();
                 self.split_for_topology_change(context, &range).await
             }
@@ -210,14 +243,28 @@ impl Request {
         let replacement_nodes: Vec<Box<dyn PipelineNode>> = resolved
             .into_iter()
             .map(|resolved_range| {
-                let target = RequestTarget::EffectivePartitionKeyRange {
-                    range: resolved_range.range,
-                    partition_key_range_id: resolved_range.partition_key_range_id,
+                let ResolvedRange {
+                    partition_key_range_id,
+                    range: resolved_range,
+                } = resolved_range;
+                let owned_range = intersect_feed_ranges(&resolved_range, range).expect(
+                    "topology provider must return ranges that overlap the request's owned EPK range",
+                );
+
+                let target = if owned_range == resolved_range {
+                    RequestTarget::PartitionKeyRange {
+                        range: resolved_range,
+                        partition_key_range_id,
+                    }
+                } else {
+                    RequestTarget::EffectivePartitionKeyRange {
+                        range: owned_range,
+                        partition_key_range_id,
+                    }
                 };
                 // Carry over the server continuation to the first replacement that
                 // covers the same starting EPK. For a split, only the left-most child
                 // inherits the continuation since it resumes where this node left off.
-                // TODO: When we support streaming ordered merges, we'll need to augment this a bit.
                 let continuation = match (target.covers_start_of(range), &self.state) {
                     (
                         true,
@@ -264,9 +311,218 @@ fn is_partition_topology_change_substatus(substatus: u32) -> bool {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::driver::dataflow::{mocks::*, ResolvedRange};
+    use crate::driver::dataflow::{mocks::*, RequestExecutor, ResolvedRange, TopologyProvider};
     use crate::models::{effective_partition_key::EffectivePartitionKey, FeedRange};
 
+    #[derive(Clone, Debug)]
+    struct PhysicalPartitionSpec {
+        partition_key_range_id: String,
+        range: FeedRange,
+    }
+
+    #[derive(Clone, Debug, PartialEq, Eq)]
+    struct RequestSpec {
+        target: RequestTarget,
+        continuation: Option<String>,
+    }
+
+    struct ScenarioTopologyProvider {
+        resolved_ranges: Vec<ResolvedRange>,
+    }
+
+    impl ScenarioTopologyProvider {
+        fn new(partitions: &[PhysicalPartitionSpec]) -> Self {
+            Self {
+                resolved_ranges: partitions
+                    .iter()
+                    .map(|partition| ResolvedRange {
+                        partition_key_range_id: partition.partition_key_range_id.clone(),
+                        range: partition.range.clone(),
+                    })
+                    .collect(),
+            }
+        }
+    }
+
+    impl TopologyProvider for ScenarioTopologyProvider {
+        fn resolve_ranges<'a>(
+            &'a mut self,
+            range: &'a FeedRange,
+            _refresh: PartitionRoutingRefresh,
+        ) -> futures::future::BoxFuture<'a, azure_core::Result<Vec<ResolvedRange>>> {
+            let resolved = self
+                .resolved_ranges
+                .iter()
+                .filter(|candidate| {
+                    candidate.range.min_inclusive() < range.max_exclusive()
+                        && candidate.range.max_exclusive() > range.min_inclusive()
+                })
+                .cloned()
+                .collect::<Vec<_>>();
+
+            Box::pin(async move {
+                if resolved.is_empty() {
+                    Err(azure_core::Error::with_message(
+                        azure_core::error::ErrorKind::Other,
+                        "scenario topology produced no overlapping ranges",
+                    ))
+                } else {
+                    Ok(resolved)
+                }
+            })
+        }
+    }
+
+    struct AlwaysGoneRequestExecutor;
+
+    impl RequestExecutor for AlwaysGoneRequestExecutor {
+        fn execute_request<'a>(
+            &'a mut self,
+            _operation: &'a CosmosOperation,
+            _target: RequestTarget,
+            _partition_routing_refresh: PartitionRoutingRefresh,
+            _continuation: Option<String>,
+        ) -> futures::future::BoxFuture<'a, azure_core::Result<CosmosResponse>> {
+            Box::pin(async { Err(gone_error()) })
+        }
+    }
+
+    fn partition_key_range_target(
+        min: &str,
+        max: &str,
+        partition_key_range_id: &str,
+    ) -> RequestTarget {
+        RequestTarget::PartitionKeyRange {
+            range: FeedRange::new(
+                EffectivePartitionKey::from(min),
+                EffectivePartitionKey::from(max),
+            ),
+            partition_key_range_id: partition_key_range_id.to_string(),
+        }
+    }
+
+    fn physical_partition(
+        min: &str,
+        max: &str,
+        partition_key_range_id: &str,
+    ) -> PhysicalPartitionSpec {
+        PhysicalPartitionSpec {
+            partition_key_range_id: partition_key_range_id.to_string(),
+            range: FeedRange::new(
+                EffectivePartitionKey::from(min),
+                EffectivePartitionKey::from(max),
+            ),
+        }
+    }
+
+    fn request_spec(target: RequestTarget, continuation: Option<&str>) -> RequestSpec {
+        RequestSpec {
+            target,
+            continuation: continuation.map(str::to_owned),
+        }
+    }
+
+    fn partition_key_request(
+        min: &str,
+        max: &str,
+        partition_key_range_id: &str,
+        continuation: Option<&str>,
+    ) -> RequestSpec {
+        request_spec(
+            partition_key_range_target(min, max, partition_key_range_id),
+            continuation,
+        )
+    }
+
+    fn effective_partition_key_request(
+        min: &str,
+        max: &str,
+        partition_key_range_id: &str,
+        continuation: Option<&str>,
+    ) -> RequestSpec {
+        request_spec(
+            effective_partition_key_range_target(min, max, partition_key_range_id),
+            continuation,
+        )
+    }
+
+    fn build_request(spec: RequestSpec) -> Request {
+        Request::new(operation(), spec.target, spec.continuation)
+    }
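+
+    // Illustrative note on how the rewrite classifies ownership (see
+    // `split_for_topology_change` above): if a request's owned EPK slice
+    // exactly equals the resolved physical partition's range, it becomes a
+    // `PartitionKeyRange` target; if the owned slice is a strict subset of
+    // the resolved range (e.g. after a merge, two old ranges now map into
+    // one physical partition), it becomes an `EffectivePartitionKeyRange`
+    // target so the backend can filter to just that slice.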
+
+    fn snapshot_request(request: &Request) -> RequestSpec {
+        let continuation = match &request.state {
+            RequestState::Initial => None,
+            RequestState::Continuing { continuation } => Some(continuation.clone()),
+            RequestState::Drained => panic!("scenario helper should not produce drained requests"),
+        };
+
+        RequestSpec {
+            target: request.target.clone(),
+            continuation,
+        }
+    }
+
+    async fn apply_topology_round(
+        requests: Vec<Request>,
+        partitions: &[PhysicalPartitionSpec],
+    ) -> Vec<Request> {
+        let mut executor = AlwaysGoneRequestExecutor;
+        let mut topology = ScenarioTopologyProvider::new(partitions);
+        let mut rewritten = Vec::new();
+
+        for mut request in requests {
+            let mut context = PipelineContext::new(&mut executor, &mut topology);
+            match request.next_page(&mut context).await.unwrap() {
+                PageResult::SplitRequired { replacement_nodes } => {
+                    rewritten.extend(replacement_nodes.into_iter().map(|node| {
+                        *node
+                            .downcast::<Request>()
+                            .expect("scenario helper should only produce request nodes")
+                    }));
+                }
+                other => panic!("expected SplitRequired during topology rewrite, got {other:?}"),
+            }
+        }
+
+        rewritten
+    }
+
+    async fn assert_topology_rewrite(
+        initial_requests: Vec<RequestSpec>,
+        topology_rounds: Vec<Vec<PhysicalPartitionSpec>>,
+        expected_requests: Vec<RequestSpec>,
+    ) {
+        let mut current = initial_requests
+            .into_iter()
+            .map(build_request)
+            .collect::<Vec<_>>();
+
+        // Each round applies a new physical partition layout to the current request list.
+        // We intentionally do not try to coalesce adjacent requests after repeated topology
+        // changes; these tests care about correctness of ownership, not optimality.
+        for partitions in topology_rounds {
+            current = apply_topology_round(current, &partitions).await;
+        }
+
+        let actual = current.iter().map(snapshot_request).collect::<Vec<_>>();
+        assert_eq!(actual, expected_requests);
+    }
+
+    fn effective_partition_key_range_target(
+        min: &str,
+        max: &str,
+        partition_key_range_id: &str,
+    ) -> RequestTarget {
+        RequestTarget::EffectivePartitionKeyRange {
+            range: FeedRange::new(
+                EffectivePartitionKey::from(min),
+                EffectivePartitionKey::from(max),
+            ),
+            partition_key_range_id: partition_key_range_id.to_string(),
+        }
+    }
+
     #[tokio::test]
     async fn request_retries_logical_partition_key_topology_change_once() {
         let mut request = Request::new(operation(), logical_partition_target(), None);
@@ -372,170 +628,110 @@ mod tests {
         assert_eq!(request.state, RequestState::Drained);
     }
 
-    // ── Split recovery tests ──────────────────────────────────────────────
+    // ── Topology rewrite scenarios ───────────────────────────────────────
 
     #[tokio::test]
-    async fn epk_range_topology_change_returns_split_required() {
-        let mut request = Request::new(operation(), epk_range_target(), None);
-        let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]);
-        let mut topology = MockTopologyProvider::new(vec![Ok(vec![
-            ResolvedRange {
-                partition_key_range_id: "1".to_string(),
-                range: FeedRange::new(
-                    EffectivePartitionKey::min(),
-                    EffectivePartitionKey::from("40"),
-                ),
-            },
-            ResolvedRange {
-                partition_key_range_id: "2".to_string(),
-                range: FeedRange::new(
-                    EffectivePartitionKey::from("40"),
-                    EffectivePartitionKey::from("80"),
-                ),
-            },
-        ])]);
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
-
-        let result = request.next_page(&mut context).await.unwrap();
-        match result {
-            PageResult::SplitRequired { replacement_nodes } => {
-                assert_eq!(replacement_nodes.len(), 2);
-
-                let r0 = replacement_nodes[0].downcast_ref::<Request>().unwrap();
-                assert_eq!(
-                    r0.target(),
-                    &RequestTarget::EffectivePartitionKeyRange {
-                        range: FeedRange::new(
-                            EffectivePartitionKey::min(),
-                            EffectivePartitionKey::from("40"),
-                        ),
-                        partition_key_range_id: "1".to_string(),
-                    }
-                );
-
-                let r1 = replacement_nodes[1].downcast_ref::<Request>().unwrap();
-                assert_eq!(
-                    r1.target(),
-                    &RequestTarget::EffectivePartitionKeyRange {
-                        range: FeedRange::new(
-                            EffectivePartitionKey::from("40"),
-                            EffectivePartitionKey::from("80"),
-                        ),
-                        partition_key_range_id: "2".to_string(),
-                    }
-                );
-            }
-            other => panic!("expected SplitRequired, got {:?}", other),
-        }
+    async fn topology_rewrite_handles_simple_split() {
+        assert_topology_rewrite(
+            vec![partition_key_request("", "80", "0", Some("server-token"))],
+            vec![vec![
+                physical_partition("", "40", "1"),
+                physical_partition("40", "80", "2"),
+            ]],
+            vec![
+                partition_key_request("", "40", "1", Some("server-token")),
+                partition_key_request("40", "80", "2", None),
+            ],
+        )
+        .await;
     }
 
     #[tokio::test]
-    async fn split_left_child_inherits_continuation() {
-        let mut request = Request::new(
-            operation(),
-            epk_range_target(),
-            Some("server-token".to_string()),
-        );
-        let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]);
-        let mut topology = MockTopologyProvider::new(vec![Ok(vec![
-            ResolvedRange {
-                partition_key_range_id: "1".to_string(),
-                range: FeedRange::new(
-                    EffectivePartitionKey::min(),
-                    EffectivePartitionKey::from("40"),
-                ),
-            },
-            ResolvedRange {
-                partition_key_range_id: "2".to_string(),
-                range: FeedRange::new(
-                    EffectivePartitionKey::from("40"),
-                    EffectivePartitionKey::from("80"),
-                ),
-            },
-        ])]);
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
-
-        let result = request.next_page(&mut context).await.unwrap();
-        match result {
-            PageResult::SplitRequired { replacement_nodes } => {
-                let left = replacement_nodes[0].downcast_ref::<Request>().unwrap();
-                assert_eq!(
-                    left.state,
-                    RequestState::Continuing {
-                        continuation: "server-token".to_string()
-                    },
-                    "left-most child should inherit the server continuation"
-                );
+    async fn topology_rewrite_handles_simple_merge() {
+        assert_topology_rewrite(
+            vec![
+                partition_key_request("", "40", "left", Some("merge-token")),
+                partition_key_request("40", "80", "right", None),
+            ],
+            vec![vec![physical_partition("", "80", "merged")]],
+            vec![
+                effective_partition_key_request("", "40", "merged", Some("merge-token")),
+                effective_partition_key_request("40", "80", "merged", None),
+            ],
+        )
+        .await;
+    }
 
-                let right = replacement_nodes[1].downcast_ref::<Request>().unwrap();
-                assert_eq!(
-                    right.state,
-                    RequestState::Initial,
-                    "non-left children should have no continuation"
-                );
-            }
-            other => panic!("expected SplitRequired, got {:?}", other),
-        }
+    #[tokio::test]
+    async fn topology_rewrite_leaves_unchanged_neighbors_alone() {
+        assert_topology_rewrite(
+            vec![
+                partition_key_request("", "40", "left", Some("ct")),
+                partition_key_request("40", "80", "right", None),
+            ],
+            vec![vec![
+                physical_partition("", "40", "left"),
+                physical_partition("40", "60", "right-a"),
+                physical_partition("60", "80", "right-b"),
+            ]],
+            vec![
+                partition_key_request("", "40", "left", Some("ct")),
+                partition_key_request("40", "60", "right-a", None),
+                partition_key_request("60", "80", "right-b", None),
+            ],
+        )
+        .await;
     }
 
     #[tokio::test]
-    async fn split_three_way_only_left_inherits_continuation() {
-        let range = FeedRange::new(
-            EffectivePartitionKey::from("10"),
-            EffectivePartitionKey::from("90"),
-        );
-        let mut request = Request::new(
-            operation(),
-            RequestTarget::EffectivePartitionKeyRange {
-                range: range.clone(),
-                partition_key_range_id: "0".to_string(),
-            },
-            Some("ct".to_string()),
-        );
-        let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]);
-        let mut topology = MockTopologyProvider::new(vec![Ok(vec![
-            ResolvedRange {
-                partition_key_range_id: "1".to_string(),
-                range: FeedRange::new(
-                    EffectivePartitionKey::from("10"),
-                    EffectivePartitionKey::from("40"),
-                ),
-            },
-            ResolvedRange {
-                partition_key_range_id: "2".to_string(),
-                range: FeedRange::new(
-                    EffectivePartitionKey::from("40"),
-                    EffectivePartitionKey::from("70"),
-                ),
-            },
-            ResolvedRange {
-                partition_key_range_id: "3".to_string(),
-                range: FeedRange::new(
-                    EffectivePartitionKey::from("70"),
-                    EffectivePartitionKey::from("90"),
-                ),
-            },
-        ])]);
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+    async fn topology_rewrite_can_return_from_merged_epk_slices_to_exact_pk_ranges() {
+        assert_topology_rewrite(
+            vec![
+                effective_partition_key_request("", "40", "merged", Some("ct")),
+                effective_partition_key_request("40", "80", "merged", None),
+            ],
+            vec![vec![
+                physical_partition("", "40", "left"),
+                physical_partition("40", "80", "right"),
+            ]],
+            vec![
+                partition_key_request("", "40", "left", Some("ct")),
+                partition_key_request("40", "80", "right", None),
+            ],
+        )
+        .await;
+    }
 
-        let result = request.next_page(&mut context).await.unwrap();
-        match result {
-            PageResult::SplitRequired { replacement_nodes } => {
-                assert_eq!(replacement_nodes.len(), 3);
-                let left = replacement_nodes[0].downcast_ref::<Request>().unwrap();
-                assert_eq!(
-                    left.state,
-                    RequestState::Continuing {
-                        continuation: "ct".to_string()
-                    }
-                );
-                let mid = replacement_nodes[1].downcast_ref::<Request>().unwrap();
-                assert_eq!(mid.state, RequestState::Initial);
-                let right = replacement_nodes[2].downcast_ref::<Request>().unwrap();
-                assert_eq!(right.state, RequestState::Initial);
-            }
-            other => panic!("expected SplitRequired, got {:?}", other),
-        }
+    #[tokio::test]
+    async fn topology_rewrite_handles_merge_then_different_split_mid_pipeline() {
+        assert_topology_rewrite(
+            vec![
+                partition_key_request("00", "20", "a", Some("ct")),
+                partition_key_request("20", "40", "b", None),
+                partition_key_request("40", "80", "c", None),
+            ],
+            vec![
+                vec![
+                    physical_partition("00", "40", "merged-left"),
+                    physical_partition("40", "80", "c"),
+                ],
+                vec![
+                    physical_partition("00", "10", "split-a"),
+                    physical_partition("10", "30", "split-b"),
+                    physical_partition("30", "50", "split-c"),
+                    physical_partition("50", "80", "split-d"),
+                ],
+            ],
+            vec![
+                partition_key_request("00", "10", "split-a", Some("ct")),
+                effective_partition_key_request("10", "20", "split-b", None),
+                effective_partition_key_request("20", "30", "split-b", None),
+                effective_partition_key_request("30", "40", "split-c", None),
+                effective_partition_key_request("40", "50", "split-c", None),
+                partition_key_request("50", "80", "split-d", None),
+            ],
+        )
+        .await;
     }
 
     #[tokio::test]
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs
index cd864c3e1a2..7be9c8e809a 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/transport/transport_pipeline.rs
@@ -288,6 +288,20 @@ pub(crate) async fn execute_transport_pipeline(
             outcome = ?result.result.outcome,
             "transport request complete"
         );
+        if result.result.is_successful() {
tracing::trace!( + ?result.result.outcome, + "transport attempt complete" + ); + } else if let TransportOutcome::HttpError { status, body, .. } = &result.result.outcome { + let body_str = String::from_utf8_lossy(body); + tracing::warn!(%status, "transport request resulted in HTTP error: {}", body_str); + } else { + tracing::warn!( + ?result.result.outcome, + "transport attempt failed" + ); + } if result.shard_id.is_some_and(|failed_shard_id| { local_connectivity_retry_count < MAX_LOCAL_CONNECTIVITY_RETRIES @@ -304,20 +318,8 @@ pub(crate) async fn execute_transport_pipeline( continue; } - let result = result.result; - if result.is_successful() { - tracing::trace!( - ?result.outcome, - "transport attempt complete" - ); - } else { - tracing::warn!( - ?result.outcome, - "transport attempt failed" - ); - } - // Check for 429 throttling → transport-level retry + let result = result.result; let action = evaluate_transport_retry(&result, &throttle_state); match action { ThrottleAction::Retry { delay, new_state } => { From 73f08ad340026fe692b65f9d72520a3abad6acaf Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Mon, 11 May 2026 21:20:05 +0000 Subject: [PATCH 27/29] Continuation tokens --- .../src/clients/container_client.rs | 9 +- .../src/clients/cosmos_client.rs | 2 +- .../src/clients/database_client.rs | 2 +- sdk/cosmos/azure_data_cosmos/src/feed.rs | 298 +++++++++++----- sdk/cosmos/azure_data_cosmos/src/lib.rs | 1 + .../azure_data_cosmos/src/options/mod.rs | 18 +- sdk/cosmos/azure_data_cosmos/src/query.rs | 1 + .../tests/emulator_tests/cosmos_query.rs | 336 +++++++++++++++++- .../src/driver/cosmos_driver.rs | 51 ++- .../src/driver/dataflow/drain.rs | 252 +++++++++++-- .../src/driver/dataflow/drained.rs | 38 ++ .../src/driver/dataflow/mocks.rs | 23 +- .../src/driver/dataflow/mod.rs | 97 ++++- .../src/driver/dataflow/planner.rs | 307 ++++++++++++++-- .../src/driver/dataflow/request.rs | 29 +- .../src/driver/dataflow/snapshot.rs | 44 +++ .../src/models/continuation_token.rs | 323 +++++++++++++++++ .../src/models/cosmos_headers.rs | 43 +++ .../src/models/cosmos_operation.rs | 9 + .../src/models/mod.rs | 3 + 20 files changed, 1701 insertions(+), 185 deletions(-) create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/snapshot.rs create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs index 9497cb35186..c09a76232ec 100644 --- a/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs +++ b/sdk/cosmos/azure_data_cosmos/src/clients/container_client.rs @@ -730,10 +730,17 @@ impl ContainerClient { if let Some(token) = options.session_token { initial_operation = initial_operation.with_session_token(token); } + if let Some(max_item_count) = options.max_item_count { + initial_operation = initial_operation.with_max_item_count(max_item_count); + } let plan = self .context .driver - .plan_operation(&initial_operation, &options.operation) + .plan_operation( + &initial_operation, + &options.operation, + options.continuation_token.as_ref(), + ) .await?; Ok(FeedItemIterator::new( self.context.driver.clone(), diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs index fb79f2d066f..abe23ac2bef 100644 --- 
a/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/clients/cosmos_client.rs
@@ -139,7 +139,7 @@ impl CosmosClient {
         let plan = self
             .context
             .driver
-            .plan_operation(&initial_operation, &operation_options)
+            .plan_operation(&initial_operation, &operation_options, None)
             .await?;
 
         Ok(FeedItemIterator::new(
diff --git a/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs b/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs
index 2a0e0a97c9e..c3cd6b613cf 100644
--- a/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/clients/database_client.rs
@@ -129,7 +129,7 @@ impl DatabaseClient {
         let plan = self
             .context
             .driver
-            .plan_operation(&initial_operation, &operation_options)
+            .plan_operation(&initial_operation, &operation_options, None)
             .await?;
 
         Ok(FeedItemIterator::new(
diff --git a/sdk/cosmos/azure_data_cosmos/src/feed.rs b/sdk/cosmos/azure_data_cosmos/src/feed.rs
index ef504e624cf..ddda7133ff6 100644
--- a/sdk/cosmos/azure_data_cosmos/src/feed.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/feed.rs
@@ -1,25 +1,25 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-use std::{pin::Pin, result, sync::Arc, task};
+use std::{marker::PhantomData, pin::Pin, sync::Arc, task};
 
 use azure_core::http::{
     headers::Headers,
     pager::{PagerContinuation, PagerResult},
 };
 use azure_data_cosmos_driver::{
-    models::{ContainerReference, CosmosResponseHeaders},
+    models::{ContainerReference, CosmosResponse as DriverResponse, CosmosResponseHeaders},
     options::OperationOptions,
     CosmosDriver, OperationPlan,
 };
-use futures::stream::BoxStream;
+use futures::future::BoxFuture;
 use futures::Stream;
 use serde::{de::DeserializeOwned, Deserialize};
 
 use crate::{
     constants, driver_bridge,
     models::{CosmosDiagnostics, CosmosResponse},
-    SessionToken,
+    ContinuationToken, SessionToken,
 };
 
 /// Represents a single page of results from a Cosmos DB feed.
@@ -263,69 +263,150 @@ impl<T> QueryFeedPage<T> {
     }
 }
 
-fn create_pagination_stream<T: DeserializeOwned + Send + 'static>(
+type DriverPageFuture =
+    BoxFuture<'static, (OperationPlan, azure_core::Result<Option<DriverResponse>>)>;
+
+/// Live pipeline state held by [`FeedPageIterator`] / [`FeedItemIterator`].
+///
+/// Owns the [`OperationPlan`] directly (rather than burying it inside an
+/// `unfold` closure) so that
+/// [`FeedPageIterator::to_continuation_token`] can snapshot it between polls.
+struct LiveState {
     driver: Arc<CosmosDriver>,
     container: Option<ContainerReference>,
-    plan: OperationPlan,
     options: OperationOptions,
-) -> BoxStream<'static, azure_core::Result<QueryFeedPage<T>>> {
-    struct State {
+    /// Always `Some` while no page fetch is in flight.
+    plan: Option<OperationPlan>,
+    /// `Some` while a page fetch is pending.
+    in_flight: Option<DriverPageFuture>,
+    exhausted: bool,
+}
+
+impl LiveState {
+    fn new(
        driver: Arc<CosmosDriver>,
        container: Option<ContainerReference>,
        plan: OperationPlan,
        options: OperationOptions,
-        continuation: Option<PagerContinuation>,
+    ) -> Self {
+        Self {
+            driver,
+            container,
+            options,
+            plan: Some(plan),
+            in_flight: None,
+            exhausted: false,
+        }
    }
-    let initial_state = State {
-        driver,
-        container,
-        options,
-        plan,
-        continuation: None,
-    };
-    let stream = futures::stream::unfold(Some(initial_state), |state| async move {
-        let Some(mut state) = state else {
-            return None; // No more pages to fetch
+
+    fn poll_next_page<T: DeserializeOwned + Send + 'static>(
+        &mut self,
+        cx: &mut task::Context<'_>,
+    ) -> task::Poll<Option<azure_core::Result<QueryFeedPage<T>>>> {
+        if self.exhausted {
+            return task::Poll::Ready(None);
+        }
+
+        if self.in_flight.is_none() {
+            // Move the plan into a future. 
The future returns the plan back so
+            // we can store it again between polls.
+            let mut plan = self
+                .plan
+                .take()
+                .expect("plan must be present between polls");
+            let driver = Arc::clone(&self.driver);
+            let container = self.container.clone();
+            let options = self.options.clone();
+            let fut: DriverPageFuture = Box::pin(async move {
+                let result = driver.execute_plan(&mut plan, container, options).await;
+                (plan, result)
+            });
+            self.in_flight = Some(fut);
+        }
+
+        let fut = self.in_flight.as_mut().expect("future just installed");
+        let (plan, result) = match fut.as_mut().poll(cx) {
+            task::Poll::Pending => return task::Poll::Pending,
+            task::Poll::Ready(out) => out,
        };
+        self.in_flight = None;
+        self.plan = Some(plan);
 
-        let result = state
-            .driver
-            .execute_plan(
-                &mut state.plan,
-                state.container.clone(),
-                state.options.clone(),
+        match result {
+            Ok(None) => {
+                self.exhausted = true;
+                task::Poll::Ready(None)
+            }
+            Err(err) => {
+                self.exhausted = true;
+                task::Poll::Ready(Some(Err(err)))
+            }
+            Ok(Some(driver_response)) => {
+                let response = driver_bridge::driver_response_to_cosmos_response::<FeedBody<T>>(
+                    driver_response,
+                );
+                match QueryFeedPage::from_response(response) {
+                    Ok(page) => task::Poll::Ready(Some(Ok(page))),
+                    Err(err) => {
+                        self.exhausted = true;
+                        task::Poll::Ready(Some(Err(err)))
+                    }
+                }
+            }
+        }
+    }
+
+    fn to_continuation_token(&self) -> azure_core::Result<ContinuationToken> {
+        let plan = self.plan.as_ref().ok_or_else(|| {
+            azure_core::Error::with_message(
+                azure_core::error::ErrorKind::Other,
+                "to_continuation_token called while a page fetch is in flight",
             )
-            .await;
-        let driver_response = match result {
-            Ok(None) => return None, // No more pages to fetch
-            Err(err) => return Some((Err(err), None)), // Propagate error, terminates the stream after this response.
-            Ok(Some(r)) => r,
-        };
+        })?;
+        plan.to_continuation_token()
+    }
+}
 
-        // Parse the response into a page
-        let response =
-            driver_bridge::driver_response_to_cosmos_response::<FeedBody<T>>(driver_response);
-        let page = match QueryFeedPage::from_response(response) {
-            Ok(page) => page,
-            Err(err) => return Some((Err(err), None)), // Propagate error, terminates the stream after this response.
-        };
-        Some((Ok(page), Some(state)))
-    });
-    Box::pin(stream)
+/// Internal source of pages for [`FeedPageIterator`] and [`FeedItemIterator`].
+///
+/// Production iterators use the [`Live`](Self::Live) variant which drives the
+/// underlying [`OperationPlan`]. Unit tests use [`Synthetic`](Self::Synthetic)
+/// to inject a pre-built sequence of pages.
+enum PageSource<T> {
+    Live(LiveState),
+    #[cfg(test)]
+    Synthetic(std::collections::VecDeque<azure_core::Result<QueryFeedPage<T>>>),
+    #[cfg(not(test))]
+    #[allow(dead_code)]
+    _Phantom(PhantomData<fn() -> T>),
+}
+
+impl<T: DeserializeOwned + Send + 'static> PageSource<T> {
+    fn poll_next_page(
+        &mut self,
+        cx: &mut task::Context<'_>,
+    ) -> task::Poll<Option<azure_core::Result<QueryFeedPage<T>>>> {
+        match self {
+            PageSource::Live(state) => state.poll_next_page::<T>(cx),
+            #[cfg(test)]
+            PageSource::Synthetic(pages) => task::Poll::Ready(pages.pop_front()),
+            #[cfg(not(test))]
+            PageSource::_Phantom(_) => task::Poll::Ready(None),
+        }
+    }
 }
 
 /// Represents a stream of items from a Cosmos DB query.
 ///
 /// See [`QueryFeedPage`] for more details on Cosmos DB feeds.
-#[pin_project::pin_project]
 pub struct FeedItemIterator<T> {
-    #[pin]
-    pages: BoxStream<'static, azure_core::Result<QueryFeedPage<T>>>,
+    source: PageSource<T>,
     current: Option<std::vec::IntoIter<T>>,
+    _marker: PhantomData<fn() -> T>,
 }
 
 impl<T: DeserializeOwned + Send + 'static> FeedItemIterator<T> {
-    /// Creates a new `FeedItemIterator` from a stream of pages.
+    /// Creates a new `FeedItemIterator` backed by the given operation plan. 
pub(crate) fn new(
         driver: Arc<CosmosDriver>,
         container: Option<ContainerReference>,
@@ -333,63 +414,106 @@ impl<T: DeserializeOwned + Send + 'static> FeedItemIterator<T> {
         options: OperationOptions,
     ) -> Self {
         Self {
-            pages: create_pagination_stream(driver, container, plan, options),
+            source: PageSource::Live(LiveState::new(driver, container, plan, options)),
             current: None,
+            _marker: PhantomData,
         }
     }
 
-    /// Converts this item iterator into a page iterator, yielding full pages instead of individual items.
+    /// Converts this item iterator into a page iterator, yielding full pages
+    /// instead of individual items.
     ///
-    /// IMPORTANT: This will DISCARD any items from the current page that have not yet been yielded by the item iterator.
-    /// Use this method before consuming any items if you want to switch to page-based iteration.
+    /// IMPORTANT: This will DISCARD any items from the current page that have
+    /// not yet been yielded by the item iterator. Use this method before
+    /// consuming any items if you want to switch to page-based iteration.
     pub fn into_pages(self) -> FeedPageIterator<T> {
-        FeedPageIterator(self.pages)
+        FeedPageIterator {
+            source: self.source,
+            _marker: PhantomData,
+        }
     }
 }
 
-impl<T: DeserializeOwned + 'static> Stream for FeedItemIterator<T> {
+impl<T: DeserializeOwned + Send + 'static> Stream for FeedItemIterator<T> {
     type Item = azure_core::Result<T>;
 
     fn poll_next(
         self: Pin<&mut Self>,
         cx: &mut task::Context<'_>,
     ) -> task::Poll<Option<Self::Item>> {
-        let mut this = self.project();
+        // Safety: we never move the inner source/current out via Pin.
+        let this = unsafe { self.get_unchecked_mut() };
         loop {
             if let Some(current) = this.current.as_mut() {
                 if let Some(item) = current.next() {
                     return task::Poll::Ready(Some(Ok(item)));
                 }
-
-                // Reset the iterator and poll for the next page.
-                *this.current = None;
+                this.current = None;
             }
 
-            match this.pages.as_mut().poll_next(cx) {
-                task::Poll::Ready(page) => match page {
-                    Some(Ok(page)) => {
-                        *this.current = Some(page.page.items.into_iter());
-                        continue;
-                    }
-                    Some(Err(err)) => return task::Poll::Ready(Some(Err(err))),
-                    None => return task::Poll::Ready(None),
-                },
+            match this.source.poll_next_page(cx) {
+                task::Poll::Ready(Some(Ok(page))) => {
+                    this.current = Some(page.into_items().into_iter());
+                    continue;
+                }
+                task::Poll::Ready(Some(Err(err))) => return task::Poll::Ready(Some(Err(err))),
+                task::Poll::Ready(None) => return task::Poll::Ready(None),
                 task::Poll::Pending => return task::Poll::Pending,
             }
         }
     }
 }
 
-pub struct FeedPageIterator<T>(BoxStream<'static, azure_core::Result<QueryFeedPage<T>>>);
+/// A stream of pages from a Cosmos DB feed operation.
+///
+/// In addition to yielding [`QueryFeedPage`]s like a regular `Stream`, this
+/// iterator can be snapshotted into a [`ContinuationToken`] for later
+/// resumption via
+/// [`to_continuation_token`](Self::to_continuation_token).
+pub struct FeedPageIterator<T> {
+    source: PageSource<T>,
+    _marker: PhantomData<fn() -> T>,
+}
+
+impl<T> FeedPageIterator<T> {
+    /// Captures the current iterator position as a [`ContinuationToken`].
+    ///
+    /// Pass the returned token to a subsequent
+    /// [`ContainerClient::query_items`](crate::clients::ContainerClient::query_items)
+    /// call (via [`QueryOptions::with_continuation_token`](crate::QueryOptions::with_continuation_token))
+    /// to resume the query at the same position.
+    ///
+    /// Snapshotting is non-mutating; the iterator may continue to be used
+    /// afterwards.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a page fetch is currently in flight (the plan
+    /// state is being mutated and cannot be safely snapshotted). 
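+    ///
+    /// # Example
+    ///
+    /// A minimal sketch (not compiled as a doc-test); the `container` handle,
+    /// query text, and item type below are illustrative assumptions:
+    ///
+    /// ```ignore
+    /// let mut pages = container
+    ///     .query_items::<serde_json::Value>("select * from c", scope, None)
+    ///     .await?
+    ///     .into_pages();
+    /// if let Some(page) = pages.next().await {
+    ///     let _items = page?.into_items();
+    ///     // Snapshot the position and persist it (e.g. send it to a client).
+    ///     let token = pages.to_continuation_token()?;
+    ///     let serialized = token.as_str().to_owned();
+    ///     // Later, resume from the serialized token:
+    ///     // QueryOptions::default()
+    ///     //     .with_continuation_token(ContinuationToken::from_string(serialized))
+    /// }
+    /// ```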
+    pub fn to_continuation_token(&self) -> azure_core::Result<ContinuationToken> {
+        match &self.source {
+            PageSource::Live(state) => state.to_continuation_token(),
+            #[cfg(test)]
+            PageSource::Synthetic(_) => Err(azure_core::Error::with_message(
+                azure_core::error::ErrorKind::Other,
+                "synthetic test iterator does not support to_continuation_token",
+            )),
+            #[cfg(not(test))]
+            PageSource::_Phantom(_) => unreachable!(),
+        }
+    }
+}
 
-impl<T: DeserializeOwned + 'static> Stream for FeedPageIterator<T> {
+impl<T: DeserializeOwned + Send + 'static> Stream for FeedPageIterator<T> {
     type Item = azure_core::Result<QueryFeedPage<T>>;
 
     fn poll_next(
-        mut self: Pin<&mut Self>,
+        self: Pin<&mut Self>,
         cx: &mut task::Context<'_>,
     ) -> task::Poll<Option<Self::Item>> {
-        self.0.as_mut().poll_next(cx)
+        // Safety: we never move source out via Pin.
+        let this = unsafe { self.get_unchecked_mut() };
+        this.source.poll_next_page(cx)
     }
 }
 
@@ -412,6 +536,16 @@ mod tests {
         }
     }
 
+    fn synthetic_item_iter<T>(
+        pages: Vec<azure_core::Result<QueryFeedPage<T>>>,
+    ) -> FeedItemIterator<T> {
+        FeedItemIterator {
+            source: PageSource::Synthetic(pages.into()),
+            current: None,
+            _marker: PhantomData,
+        }
+    }
+
     #[tokio::test]
     async fn item_iterator_yields_all_items_from_multiple_pages() {
         let pages = vec![
@@ -420,12 +554,7 @@ mod tests {
             Ok(create_test_page(vec![6], None)),
         ];
 
-        let stream = futures::stream::iter(pages);
-        let item_iter = FeedItemIterator {
-            pages: Box::pin(stream),
-            current: None,
-        };
-
+        let item_iter = synthetic_item_iter(pages);
         let items: Vec<_> = item_iter
             .collect::<Vec<_>>()
             .await
@@ -442,13 +571,7 @@ mod tests {
             Ok(create_test_page(vec![3], None)),
         ];
 
-        let stream = futures::stream::iter(pages);
-        let page_iter = FeedItemIterator {
-            pages: Box::pin(stream),
-            current: None,
-        }
-        .into_pages();
-
+        let page_iter = synthetic_item_iter(pages).into_pages();
         let page_items: Vec<_> = page_iter
             .collect::<Vec<_>>()
             .await
@@ -468,11 +591,7 @@ mod tests {
             )),
         ];
 
-        let stream = futures::stream::iter(pages);
-        let mut item_iter = FeedItemIterator {
-            pages: Box::pin(stream),
-            current: None,
-        };
+        let mut item_iter = synthetic_item_iter(pages);
 
         // First two items should succeed
         assert_eq!(item_iter.next().await.unwrap().unwrap(), 1);
@@ -490,12 +609,7 @@ mod tests {
             Ok(create_test_page(vec![2], None)),
         ];
 
-        let stream = futures::stream::iter(pages);
-        let item_iter = FeedItemIterator {
-            pages: Box::pin(stream),
-            current: None,
-        };
-
+        let item_iter = synthetic_item_iter(pages);
         let items: Vec<_> = item_iter
             .collect::<Vec<_>>()
             .await
diff --git a/sdk/cosmos/azure_data_cosmos/src/lib.rs b/sdk/cosmos/azure_data_cosmos/src/lib.rs
index 33af05d6edc..196302222d1 100644
--- a/sdk/cosmos/azure_data_cosmos/src/lib.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/lib.rs
@@ -29,6 +29,7 @@ pub use clients::CosmosClientBuilder;
 pub use account_endpoint::CosmosAccountEndpoint;
 pub use account_reference::CosmosAccountReference;
 #[doc(inline)]
+pub use azure_data_cosmos_driver::models::ContinuationToken;
 pub use azure_data_cosmos_driver::models::FeedRange;
 #[doc(inline)]
 pub use azure_data_cosmos_driver::models::PartitionKey;
diff --git a/sdk/cosmos/azure_data_cosmos/src/options/mod.rs b/sdk/cosmos/azure_data_cosmos/src/options/mod.rs
index e6c5c705f38..ec97facd710 100644
--- a/sdk/cosmos/azure_data_cosmos/src/options/mod.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/options/mod.rs
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 use crate::models::ThroughputProperties;
+use crate::ContinuationToken;
 use azure_core::http::headers::Headers;
 use std::fmt;
 use std::fmt::Display;
@@ -270,9 +271,18 @@ pub struct QueryOptions {
     /// Maximum number of items to return per page. 
     ///
     /// When set, the server will return at most this many items in each response page.
-    /// This is useful for controlling memory usage and for testing pagination behavior.
     /// If not set, the server uses its default page size.
+    ///
+    /// This is a _hint_ to the server, not a client-side guarantee of the maximum returned page size.
+    /// In a cross-partition query, each partition may return up to this many items,
+    /// so the total page size could be up to this value times the number of partitions involved.
+    /// Some server operations may return fewer, or even more, items than this value based on internal heuristics.
     pub max_item_count: Option<u32>,
+
+    /// Continuation token from a prior page iterator, used to resume the query.
+    ///
+    /// See [`FeedPageIterator::to_continuation_token`](crate::FeedPageIterator::to_continuation_token).
+    pub continuation_token: Option<ContinuationToken>,
 }
 
 impl QueryOptions {
@@ -293,6 +303,12 @@ impl QueryOptions {
         self.max_item_count = Some(max_item_count);
         self
     }
+
+    /// Sets a continuation token to resume the query at a previous position.
+    pub fn with_continuation_token(mut self, continuation_token: ContinuationToken) -> Self {
+        self.continuation_token = Some(continuation_token);
+        self
+    }
 }
 
 /// Options to be passed to [`ContainerClient::read()`](crate::clients::ContainerClient::read()).
diff --git a/sdk/cosmos/azure_data_cosmos/src/query.rs b/sdk/cosmos/azure_data_cosmos/src/query.rs
index fe0bf812d8f..4aebb661955 100644
--- a/sdk/cosmos/azure_data_cosmos/src/query.rs
+++ b/sdk/cosmos/azure_data_cosmos/src/query.rs
@@ -12,6 +12,7 @@ use serde::Serialize;
 /// so it is important to choose the appropriate scope for your query to ensure it is executed efficiently.
 /// Queries that cross physical partition boundaries require the client to fan out the query to
 /// multiple partitions and aggregate the results, which can be expensive and slow for large datasets.
+#[derive(Clone)]
 pub enum QueryScope {
     Partition(PartitionKey),
     FeedRange(FeedRange),
diff --git a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
index ee131e33a99..22ceba8185d 100644
--- a/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
+++ b/sdk/cosmos/azure_data_cosmos/tests/emulator_tests/cosmos_query.rs
@@ -15,7 +15,7 @@ use azure_data_cosmos::{
     constants::{self, SUB_STATUS},
     options::{OperationOptions, QueryOptions},
     query::QueryScope,
-    Query,
+    ContinuationToken, Query,
 };
 use framework::{test_data, MockItem, TestClient};
 use futures::{StreamExt, TryStreamExt};
@@ -46,25 +46,67 @@ where
     T: DeserializeOwned + Send + Eq + std::fmt::Debug + 'static,
 {
     let container_client = test_data::create_container_with_items(db_client, items, None).await?;
+    let query: Query = query.into();
 
-    let mut query_options = QueryOptions::default();
-    if let Some(max_item_count) = options.max_item_count {
-        query_options = query_options.with_max_item_count(max_item_count);
-    }
-
-    let mut pages = container_client
-        .query_items::<T>(query, scope, Some(query_options))
-        .await?
-        .into_pages();
+    let build_options = || -> QueryOptions {
+        let mut o = QueryOptions::default();
+        if let Some(max_item_count) = options.max_item_count {
+            o = o.with_max_item_count(max_item_count);
+        }
+        o
+    };
 
     let mut actual_items = Vec::new();
-    while let Some(page) = pages.next().await {
-        actual_items.extend(page?.into_items());
-    }
 
     if options.use_continuation_token_resume {
-        // Placeholder for future continuation token-based resume support. 
- panic!("Continuation token resume support not yet implemented"); + // Fetch one page at a time, taking a continuation token after each + // page and resuming a brand-new iterator from the token. This + // exercises the suspend/resume path end-to-end. + let mut continuation: Option = None; + loop { + let mut query_options = build_options(); + if let Some(token) = continuation.take() { + query_options = query_options.with_continuation_token(token); + } + let mut pages = container_client + .query_items::(query.clone(), scope.clone(), Some(query_options)) + .await? + .into_pages(); + + let Some(page) = pages.next().await else { + break; + }; + let page = page?; + actual_items.extend(page.into_items()); + + // Round-trip the continuation token through string form to + // mimic real usage (e.g. persisting it across processes). + let token = pages.to_continuation_token()?; + let serialized = token.as_str().to_owned(); + let restored = ContinuationToken::from_string(serialized); + // Drop the iterator before checking for termination — we want to + // observe the snapshot taken right after the page was emitted. + drop(pages); + + // The pipeline reports its own terminal state via + // `to_continuation_token` returning a token whose decoded + // snapshot is `Drained`. We can't introspect that here, so we + // detect termination by attempting one more poll on a fresh + // iterator: if it yields no page, we're done. + // + // To avoid an extra round-trip when the snapshot is trivially + // drained, we still always set `continuation` and let the + // planner short-circuit to a `DrainedLeaf`. + continuation = Some(restored); + } + } else { + let mut pages = container_client + .query_items::(query, scope, Some(build_options())) + .await? + .into_pages(); + while let Some(page) = pages.next().await { + actual_items.extend(page?.into_items()); + } } assert_eq!(expected_items, actual_items); @@ -428,3 +470,267 @@ pub async fn cross_partition_query_pagination() -> Result<(), Box> { ) .await } + +#[tokio::test] +#[cfg_attr( + not(test_category = "emulator"), + ignore = "requires test_category 'emulator'" +)] +pub async fn cross_partition_query_suspend_resume() -> Result<(), Box> { + TestClient::run_with_unique_db( + async |_, db_client| { + // Four logical partitions × three items per partition. With a + // page size of one, this exercises both intra-partition and + // cross-partition resume points. + let items = test_data::generate_mock_items(4, 3); + + execute_query_test( + db_client, + items.clone(), + "select * from c", + QueryScope::full_container(), + items, + QueryTestOptions { + max_item_count: Some(1), + use_continuation_token_resume: true, + }, + ) + .await?; + + Ok(()) + }, + None, + ) + .await +} + +#[tokio::test] +#[cfg_attr( + not(test_category = "emulator"), + ignore = "requires test_category 'emulator'" +)] +pub async fn query_rejects_newer_sdk_continuation_token() -> Result<(), Box> { + TestClient::run_with_unique_db( + async |_, db_client| { + let items = test_data::generate_mock_items(1, 1); + let container_client = + test_data::create_container_with_items(db_client, items, None).await?; + + // A `c2.` prefix indicates the token was issued by a future + // SDK version this client does not understand. 
+ let token = ContinuationToken::from_string("c2.something".to_string()); + let options = QueryOptions::default().with_continuation_token(token); + + let Err(err) = container_client + .query_items::( + "select * from c", + QueryScope::full_container(), + Some(options), + ) + .await + else { + panic!("expected newer-SDK token to be rejected"); + }; + let message = err.to_string(); + assert!( + message.contains("newer SDK") || message.contains("c2"), + "unexpected error: {message}" + ); + + Ok(()) + }, + None, + ) + .await +} + +#[tokio::test] +#[cfg_attr( + not(test_category = "emulator"), + ignore = "requires test_category 'emulator'" +)] +pub async fn query_rejects_server_token_for_cross_partition() -> Result<(), Box> { + TestClient::run_with_unique_db( + async |_, db_client| { + let items = test_data::generate_mock_items(2, 1); + let container_client = + test_data::create_container_with_items(db_client, items, None).await?; + + // An un-prefixed token is treated as an opaque server + // continuation, which is only valid for trivial (single- + // partition) queries. + let token = ContinuationToken::from_string("opaque-server-blob".to_string()); + let options = QueryOptions::default().with_continuation_token(token); + + let Err(err) = container_client + .query_items::( + "select * from c", + QueryScope::full_container(), + Some(options), + ) + .await + else { + panic!("expected opaque server token to be rejected for cross-partition query"); + }; + let message = err.to_string(); + assert!( + message.contains("opaque server continuation token"), + "unexpected error: {message}" + ); + + Ok(()) + }, + None, + ) + .await +} + +#[tokio::test] +#[cfg_attr( + not(test_category = "emulator"), + ignore = "requires test_category 'emulator'" +)] +pub async fn single_partition_query_resumes_with_raw_server_token() -> Result<(), Box> { + use base64::engine::general_purpose::URL_SAFE_NO_PAD; + use base64::Engine as _; + + TestClient::run_with_unique_db( + async |_, db_client| { + // One logical partition × five items so we get multiple pages + // with `max_item_count(1)`. + let items = test_data::generate_mock_items(1, 5); + let expected: Vec = + collect_matching_items(&items, |p| p.partition_key == "partition0"); + assert!( + expected.len() > 1, + "need multiple items to exercise pagination" + ); + + let container_client = + test_data::create_container_with_items(db_client, items, None).await?; + let scope = QueryScope::partition("partition0"); + + // --- Round 1: fetch the first page through the SDK and pull + // the SDK-issued `c1.` token. --- + let mut pages = container_client + .query_items::( + "select * from c", + scope.clone(), + Some(QueryOptions::default().with_max_item_count(1)), + ) + .await? + .into_pages(); + + let first_page = pages + .next() + .await + .expect("expected at least one page from the server")?; + let mut actual: Vec = first_page.into_items(); + + let token = pages.to_continuation_token()?; + let raw = token.as_str().to_owned(); + drop(pages); + + assert!( + raw.starts_with("c1."), + "expected SDK to emit a c1.-prefixed token, got: {raw}" + ); + + // Crack the SDK token open. We deliberately couple this test + // to the on-the-wire format so we can recover the underlying + // server continuation without exposing extra public APIs. + // + // Format: `c1.` + base64url-no-pad(JSON of `PipelineNodeState`). + // For a trivial single-partition query the JSON is shaped like + // `{"kind":"request","server_continuation":""}`. 
+            let payload = raw.strip_prefix("c1.").unwrap();
+            let json_bytes = URL_SAFE_NO_PAD
+                .decode(payload)
+                .expect("c1. payload must be valid base64url-no-pad");
+            let snapshot: serde_json::Value = serde_json::from_slice(&json_bytes)
+                .expect("decoded c1. payload must be valid JSON");
+            assert_eq!(
+                snapshot.get("kind").and_then(|v| v.as_str()),
+                Some("request"),
+                "trivial single-partition pipeline should snapshot as a single Request node, got: {snapshot}"
+            );
+            let server_token = snapshot
+                .get("server_continuation")
+                .and_then(|v| v.as_str())
+                .expect("Request node must carry a server_continuation after the first page")
+                .to_owned();
+            assert!(
+                !server_token.is_empty(),
+                "server continuation token should not be empty"
+            );
+            assert!(
+                !server_token.starts_with("c1.") && !server_token.starts_with("c2."),
+                "server continuation must not look like an SDK token, got: {server_token}"
+            );
+
+            // --- Round 2: drain the rest of the query using the raw
+            // server token directly (no `c1.` prefix). The SDK accepts
+            // un-prefixed tokens as an opaque server fallback for trivial
+            // single-partition queries. ---
+            let mut continuation = Some(ContinuationToken::from_string(server_token));
+            let mut page_count: usize = 1;
+            loop {
+                let mut options = QueryOptions::default().with_max_item_count(1);
+                if let Some(t) = continuation.take() {
+                    options = options.with_continuation_token(t);
+                }
+
+                let mut pages = container_client
+                    .query_items::<MockItem>("select * from c", scope.clone(), Some(options))
+                    .await?
+                    .into_pages();
+
+                let Some(page) = pages.next().await else {
+                    break;
+                };
+                let page = page?;
+                let items_in_page = page.into_items();
+                let was_empty = items_in_page.is_empty();
+                actual.extend(items_in_page);
+                page_count += 1;
+
+                let next_token = pages.to_continuation_token()?;
+                let raw_next = next_token.as_str().to_owned();
+                drop(pages);
+
+                // Subsequent SDK-issued tokens must still be `c1.`-prefixed.
+                assert!(
+                    raw_next.starts_with("c1."),
+                    "follow-up token must remain c1.-prefixed, got: {raw_next}"
+                );
+
+                // Decode again to detect end-of-stream: when the inner
+                // snapshot is `{"kind":"drained"}` we are done.
+                let payload = raw_next.strip_prefix("c1.").unwrap();
+                let json_bytes = URL_SAFE_NO_PAD
+                    .decode(payload)
+                    .expect("c1. payload must be valid base64url-no-pad");
+                let snapshot: serde_json::Value =
+                    serde_json::from_slice(&json_bytes).expect("payload must be valid JSON");
+                let kind = snapshot.get("kind").and_then(|v| v.as_str()).unwrap_or("");
+                if kind == "drained" || was_empty {
+                    break;
+                }
+
+                // Continue feeding the SDK its own next-token. 
+                continuation = Some(ContinuationToken::from_string(raw_next));
+
+                assert!(
+                    page_count <= expected.len() + 2,
+                    "fetched more pages ({page_count}) than expected ({})",
+                    expected.len()
+                );
+            }
+
+            assert_eq!(expected, actual);
+            Ok(())
+        },
+        None,
+    )
+    .await
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 5955f650853..bbf6c78961a 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -11,8 +11,8 @@ use crate::{
         cache::{PartitionKeyRangeCache, PkRangeFetchResult},
         dataflow::{
             planner, query_plan::QueryPlan, CachedTopologyProvider, OperationPlan,
-            PartitionRoutingRefresh, PipelineContext, RequestExecutor, RequestTarget,
-            ResolvedRange, TopologyProvider,
+            PartitionRoutingRefresh, PipelineContext, PipelineNodeState, RequestExecutor,
+            RequestTarget, ResolvedRange, TopologyProvider,
         },
         pipeline::operation_pipeline::OperationOverrides,
         routing::{
@@ -23,8 +23,9 @@ use crate::{
     },
     models::{
         effective_partition_key::EffectivePartitionKey, AccountEndpoint, AccountReference,
-        ActivityId, ContainerProperties, ContainerReference, CosmosOperation, DatabaseProperties,
-        DatabaseReference, OperationTarget, PartitionKey, ResourceType,
+        ActivityId, ContainerProperties, ContainerReference, ContinuationToken, CosmosOperation,
+        DatabaseProperties, DatabaseReference, OperationTarget, PartitionKey, ResolvedToken,
+        ResourceType,
     },
     options::{
         ConnectionPoolOptions, DiagnosticsOptions, DriverOptions, OperationOptions,
@@ -1251,7 +1252,7 @@ impl CosmosDriver {
         operation: CosmosOperation,
         options: OperationOptions,
     ) -> azure_core::Result<Option<CosmosResponse>> {
-        let mut plan = self.plan_operation(&operation, &options).await?;
+        let mut plan = self.plan_operation(&operation, &options, None).await?;
         self.execute_plan(&mut plan, operation.container().cloned(), options)
             .await
     }
@@ -1548,10 +1549,21 @@ impl CosmosDriver {
     /// For trivial operations (non-query or single-partition), returns a
     /// singleton pipeline immediately. For cross-partition queries, fetches a
     /// query plan from the backend and builds a fan-out pipeline.
+    ///
+    /// `continuation` optionally provides resume state from a prior call. Two
+    /// kinds of tokens are accepted:
+    ///
+    /// - SDK-issued tokens (`c1.…`) carry a serialized snapshot of the
+    ///   previous pipeline's state and can resume any operation.
+    /// - Opaque server-issued tokens (no `c<n>.` prefix) are accepted only
+    ///   for trivial operations; passing one to a cross-partition query
+    ///   returns a [`DataConversion`](azure_core::error::ErrorKind::DataConversion)
+    ///   error.
     pub async fn plan_operation(
         &self,
         operation: &CosmosOperation,
         options: &OperationOptions,
+        continuation: Option<&ContinuationToken>,
     ) -> azure_core::Result<OperationPlan> {
         if !self.initialized.load(Ordering::Acquire) {
             let endpoint = AccountEndpoint::from(self.options.account());
@@ -1566,9 +1578,31 @@ impl CosmosDriver {
 
         tracing::debug!(operation_type = ?operation.operation_type(), resource_type = ?operation.resource_type(), resource_reference = ?operation.resource_reference(), "planning operation");
 
+        // Resolve the continuation token (if any) into a planner-ready resume
+        // state. Server-issued tokens are only valid for trivial operations.
+        let resume_state = match continuation {
+            None => None,
+            Some(token) => match token.resolve()? 
{
+                ResolvedToken::ClientV1(state) => Some(state),
+                ResolvedToken::ServerOpaque(server_token) => {
+                    if !operation.is_trivial() {
+                        return Err(azure_core::Error::with_message(
+                            azure_core::error::ErrorKind::DataConversion,
+                            "an opaque server continuation token cannot be used to resume a \
+                             cross-partition query; use the SDK-issued continuation token from \
+                             FeedPageIterator::to_continuation_token()",
+                        ));
+                    }
+                    Some(PipelineNodeState::Request {
+                        server_continuation: Some(server_token),
+                    })
+                }
+            },
+        };
+
         // Trivial plan: anything that isn't a cross-partition query.
         if operation.is_trivial() {
-            let pipeline = planner::build_trivial_pipeline(operation)?;
+            let pipeline = planner::build_trivial_pipeline(operation, resume_state)?;
             return Ok(OperationPlan::new(pipeline));
         }
@@ -1606,7 +1640,8 @@ impl CosmosDriver {
             |container, continuation| self.fetch_pk_ranges_from_service(container, continuation),
         );
 
-        let pipeline = planner::build_sequential_drain(&query_plan, &mut topology, operation).await?;
+        let pipeline =
+            planner::build_sequential_drain(&query_plan, &mut topology, operation, resume_state).await?;
 
         Ok(OperationPlan::new(pipeline))
     }
@@ -2437,7 +2472,7 @@ mod tests {
         assert_send(driver.execute_operation(todo!(), todo!()));
         assert_send(driver.execute_point_operation(todo!(), todo!()));
         assert_send(driver.execute_plan(todo!(), todo!(), todo!()));
-        assert_send(driver.plan_operation(todo!(), todo!()));
+        assert_send(driver.plan_operation(todo!(), todo!(), todo!()));
     }
 
     // Account properties with two readable locations for regional fallback tests.
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs
index 9c652fe5001..ffb183403cc 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs
@@ -12,7 +12,9 @@ use std::collections::VecDeque;
 
 use async_trait::async_trait;
 
-use super::{ChildNodes, PageResult, PipelineContext, PipelineNode};
+use crate::models::FeedRange;
+
+use super::{ChildNodes, PageResult, PipelineContext, PipelineNode, PipelineNodeState};
 
 /// Maximum number of consecutive split retries before giving up.
 ///
@@ -54,7 +56,26 @@ impl PipelineNode for SequentialDrain {
         };
 
         match current.next_page(context).await? {
-            PageResult::Page(response) => return Ok(PageResult::Page(response)),
+            PageResult::Page {
+                response,
+                is_terminal,
+            } => {
+                if is_terminal {
+                    // The front child has emitted its last page; evict it
+                    // now so a snapshot taken after this call no longer
+                    // references it. The drain itself is terminal only
+                    // when this was its last child.
+                    self.children.pop_front();
+                    return Ok(PageResult::Page {
+                        response,
+                        is_terminal: self.children.is_empty(),
+                    });
+                }
+                return Ok(PageResult::Page {
+                    response,
+                    is_terminal: false,
+                });
+            }
             PageResult::Drained => {
                 self.children.pop_front();
                 // Loop to try the next child.
@@ -96,18 +117,44 @@ impl PipelineNode for SequentialDrain {
     fn into_children(self) -> Vec<Box<dyn PipelineNode>> {
         self.children.into_iter().collect()
     }
+
+    fn snapshot_state(&self) -> PipelineNodeState {
+        let Some(front) = self.children.front() else {
+            return PipelineNodeState::Drained;
+        };
+        let Some(range) = front.feed_range() else {
+            // Shouldn't happen for an EPK-ordered drain, but degrade gracefully:
+            // serialize the child snapshot directly with no cursor. 
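+            // A resumed plan built from such a bare child snapshot (e.g. a
+            // plain `Request` state) is treated by the planner as a cursor at
+            // the minimum EPK (see `build_sequential_drain` in planner.rs).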
+ return front.snapshot_state(); + }; + PipelineNodeState::SequentialDrain { + current_min_epk: range.min_inclusive().as_str().to_string(), + left_most: Box::new(front.snapshot_state()), + } + } + + fn feed_range(&self) -> Option<&FeedRange> { + self.children.front().and_then(|c| c.feed_range()) + } } #[cfg(test)] mod tests { use super::*; use crate::driver::dataflow::mocks::*; + use crate::models::effective_partition_key::EffectivePartitionKey; #[tokio::test] async fn drains_single_child() { let child = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"p1"))), - Ok(PageResult::Page(response(b"p2"))), + Ok(PageResult::Page { + response: response(b"p1"), + is_terminal: false, + }), + Ok(PageResult::Page { + response: response(b"p2"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let mut drain = SequentialDrain::new(vec![Box::new(child)]); @@ -129,16 +176,28 @@ mod tests { #[tokio::test] async fn drains_multiple_children_in_order() { let child1 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"c1-p1"))), + Ok(PageResult::Page { + response: response(b"c1-p1"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let child2 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"c2-p1"))), - Ok(PageResult::Page(response(b"c2-p2"))), + Ok(PageResult::Page { + response: response(b"c2-p1"), + is_terminal: false, + }), + Ok(PageResult::Page { + response: response(b"c2-p2"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let child3 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"c3-p1"))), + Ok(PageResult::Page { + response: response(b"c3-p1"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let mut drain = @@ -194,11 +253,17 @@ mod tests { #[tokio::test] async fn handles_split_of_first_child() { let replacement1 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"split-left"))), + Ok(PageResult::Page { + response: response(b"split-left"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let replacement2 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"split-right"))), + Ok(PageResult::Page { + response: response(b"split-right"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); @@ -207,7 +272,10 @@ mod tests { })]); let trailing_child = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"trailing"))), + Ok(PageResult::Page { + response: response(b"trailing"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); @@ -234,12 +302,18 @@ mod tests { #[tokio::test] async fn handles_split_of_middle_child() { let child1 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"c1"))), + Ok(PageResult::Page { + response: response(b"c1"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let replacement = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"c2-split"))), + Ok(PageResult::Page { + response: response(b"c2-split"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let split_child = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired { @@ -247,7 +321,10 @@ mod tests { })]); let child3 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"c3"))), + Ok(PageResult::Page { + response: response(b"c3"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); @@ -278,12 +355,18 @@ mod tests { #[tokio::test] async fn handles_split_of_last_child() { let child1 = MockLeaf::with_pages(vec![ - Ok(PageResult::Page(response(b"c1"))), + Ok(PageResult::Page { + response: response(b"c1"), + is_terminal: false, + }), Ok(PageResult::Drained), ]); let 
replacement = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"last-split"))),
+            Ok(PageResult::Page {
+                response: response(b"last-split"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
         let split_child = MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired {
@@ -309,7 +392,10 @@ mod tests {
     #[tokio::test]
     async fn handles_cascading_split() {
         let final_leaf = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"final"))),
+            Ok(PageResult::Page {
+                response: response(b"final"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
 
@@ -335,9 +421,11 @@ mod tests {
 
     #[tokio::test]
     async fn split_retry_limit_prevents_infinite_loop() {
-        let mut current: Box<dyn PipelineNode> = Box::new(MockLeaf::with_pages(vec![Ok(
-            PageResult::Page(response(b"unreachable")),
-        )]));
+        let mut current: Box<dyn PipelineNode> =
+            Box::new(MockLeaf::with_pages(vec![Ok(PageResult::Page {
+                response: response(b"unreachable"),
+                is_terminal: false,
+            })]));
 
         for _ in 0..12 {
             current = Box::new(MockLeaf::with_pages(vec![Ok(PageResult::SplitRequired {
@@ -361,7 +449,10 @@ mod tests {
     async fn child_drained_immediately_skips_to_next() {
         let empty_child = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]);
         let real_child = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"data"))),
+            Ok(PageResult::Page {
+                response: response(b"data"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
 
@@ -380,15 +471,24 @@ mod tests {
     #[tokio::test]
     async fn split_with_three_way_replacement() {
         let r1 = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"r1"))),
+            Ok(PageResult::Page {
+                response: response(b"r1"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
         let r2 = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"r2"))),
+            Ok(PageResult::Page {
+                response: response(b"r2"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
         let r3 = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"r3"))),
+            Ok(PageResult::Page {
+                response: response(b"r3"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
 
@@ -419,7 +519,10 @@ mod tests {
     #[tokio::test]
     async fn error_after_partial_drain() {
         let child1 = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"ok"))),
+            Ok(PageResult::Page {
+                response: response(b"ok"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
         let child2 = MockLeaf::with_pages(vec![Err(azure_core::Error::with_message(
@@ -443,13 +546,25 @@ mod tests {
     #[tokio::test]
     async fn multiple_pages_per_child_then_advance() {
         let child1 = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"c1-p1"))),
-            Ok(PageResult::Page(response(b"c1-p2"))),
-            Ok(PageResult::Page(response(b"c1-p3"))),
+            Ok(PageResult::Page {
+                response: response(b"c1-p1"),
+                is_terminal: false,
+            }),
+            Ok(PageResult::Page {
+                response: response(b"c1-p2"),
+                is_terminal: false,
+            }),
+            Ok(PageResult::Page {
+                response: response(b"c1-p3"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
         let child2 = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"c2-p1"))),
+            Ok(PageResult::Page {
+                response: response(b"c2-p1"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
 
@@ -480,7 +595,10 @@ mod tests {
     #[tokio::test]
     async fn split_produces_page_on_same_call() {
         let replacement = MockLeaf::with_pages(vec![
-            Ok(PageResult::Page(response(b"immediate"))),
+            Ok(PageResult::Page {
+                response: response(b"immediate"),
+                is_terminal: false,
+            }),
             Ok(PageResult::Drained),
         ]);
 
@@ -509,4 +627,78 @@ mod tests {
         let drain = SequentialDrain::new(vec![Box::new(c1), Box::new(c2), 
Box::new(c3)]); assert_eq!(drain.children().len(), 3); } + + #[tokio::test] + async fn terminal_page_pops_child_eagerly() { + // The first child returns one terminal page; the drain must pop it + // immediately so a snapshot taken right after the call already + // points at the next child. + let child1 = MockLeaf::with_pages(vec![Ok(PageResult::Page { + response: response(b"c1-final"), + is_terminal: true, + })]) + .with_feed_range(FeedRange::new( + EffectivePartitionKey::from("00"), + EffectivePartitionKey::from("80"), + )); + let child2 = MockLeaf::with_pages(vec![ + Ok(PageResult::Page { + response: response(b"c2-p1"), + is_terminal: false, + }), + Ok(PageResult::Drained), + ]) + .with_feed_range(FeedRange::new( + EffectivePartitionKey::from("80"), + EffectivePartitionKey::from("FF"), + )); + + let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(child2)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + let page = unwrap_page(drain.next_page(&mut context).await); + assert_eq!(page.body(), b"c1-final"); + + // Snapshot must already reference child2 (cursor at "80"), not the + // just-drained child1. + let snapshot = drain.snapshot_state(); + let PipelineNodeState::SequentialDrain { + current_min_epk, .. + } = snapshot + else { + panic!("expected SequentialDrain snapshot, got {snapshot:?}"); + }; + assert_eq!(current_min_epk, "80"); + } + + #[tokio::test] + async fn terminal_page_on_last_child_marks_drain_terminal() { + let only_child = MockLeaf::with_pages(vec![Ok(PageResult::Page { + response: response(b"final"), + is_terminal: true, + })]) + .with_feed_range(FeedRange::new( + EffectivePartitionKey::from("00"), + EffectivePartitionKey::from("FF"), + )); + + let mut drain = SequentialDrain::new(vec![Box::new(only_child)]); + let mut executor = NoopRequestExecutor; + let mut topology = NoopTopologyProvider; + let mut context = PipelineContext::new(&mut executor, &mut topology); + + match drain.next_page(&mut context).await.unwrap() { + PageResult::Page { + response, + is_terminal, + } => { + assert_eq!(response.body(), b"final"); + assert!(is_terminal, "drain must propagate terminal flag"); + } + other => panic!("expected Page, got {other:?}"), + } + assert!(matches!(drain.snapshot_state(), PipelineNodeState::Drained)); + } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs new file mode 100644 index 00000000000..e52a36c02b4 --- /dev/null +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +//! A trivial leaf node that immediately reports `Drained`. +//! +//! Used when reconstructing a pipeline from a continuation token whose +//! [`PipelineNodeState::Drained`](super::PipelineNodeState) snapshot indicates +//! the operation already completed. Allows the SDK iterator to behave +//! uniformly without the planner having to special-case the "already done" +//! state. 
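+//!
+//! A sketch of the planner short-circuit this enables (mirroring the real
+//! match arms in `planner.rs`):
+//!
+//! ```ignore
+//! if let Some(PipelineNodeState::Drained) = resume {
+//!     // Resuming a finished operation yields a pipeline that immediately
+//!     // reports `Drained`; the iterator ends without issuing any requests.
+//!     return Ok(Pipeline::new(Box::new(DrainedLeaf)));
+//! }
+//! ```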
+
+use async_trait::async_trait;
+
+use super::{ChildNodes, PageResult, PipelineContext, PipelineNode, PipelineNodeState};
+
+pub(crate) struct DrainedLeaf;
+
+#[async_trait]
+impl PipelineNode for DrainedLeaf {
+    async fn next_page(
+        &mut self,
+        _context: &mut PipelineContext<'_>,
+    ) -> azure_core::Result<PageResult> {
+        Ok(PageResult::Drained)
+    }
+
+    fn children(&self) -> ChildNodes<'_> {
+        ChildNodes::None
+    }
+
+    fn into_children(self) -> Vec<Box<dyn PipelineNode>> {
+        Vec::new()
+    }
+
+    fn snapshot_state(&self) -> PipelineNodeState {
+        PipelineNodeState::Drained
+    }
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
index 74d401b4830..5d4e866fe01 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
@@ -10,7 +10,7 @@ use futures::future::BoxFuture;
 
 use super::{
     ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode,
-    RequestExecutor, RequestTarget, ResolvedRange, TopologyProvider,
+    PipelineNodeState, RequestExecutor, RequestTarget, ResolvedRange, TopologyProvider,
 };
 use crate::{
     diagnostics::DiagnosticsContextBuilder,
@@ -27,6 +27,7 @@ use crate::{
 /// A mock leaf node that returns pre-configured page results.
 pub(crate) struct MockLeaf {
     pages: VecDeque<azure_core::Result<PageResult>>,
+    feed_range: Option<FeedRange>,
 }
 
 impl MockLeaf {
@@ -34,8 +35,16 @@ impl MockLeaf {
     pub fn with_pages(pages: Vec<azure_core::Result<PageResult>>) -> Self {
         Self {
             pages: pages.into(),
+            feed_range: None,
         }
     }
+
+    /// Sets the feed range reported by [`PipelineNode::feed_range`].
+    #[allow(dead_code)]
+    pub fn with_feed_range(mut self, range: FeedRange) -> Self {
+        self.feed_range = Some(range);
+        self
+    }
 }
 
 #[async_trait::async_trait]
@@ -56,6 +65,14 @@ impl PipelineNode for MockLeaf {
     fn into_children(self) -> Vec<Box<dyn PipelineNode>> {
         vec![]
     }
+
+    fn snapshot_state(&self) -> PipelineNodeState {
+        PipelineNodeState::Drained
+    }
+
+    fn feed_range(&self) -> Option<&FeedRange> {
+        self.feed_range.as_ref()
+    }
 }
 
 // ── Request executors ───────────────────────────────────────────────────────
@@ -164,7 +181,7 @@ impl TopologyProvider for MockTopologyProvider {
 /// Extracts the `CosmosResponse` from a `PageResult::Page`, panicking otherwise.
 pub(crate) fn unwrap_page(result: azure_core::Result<PageResult>) -> CosmosResponse {
     match result.expect("expected Ok result") {
-        PageResult::Page(r) => r,
+        PageResult::Page { response, .. } => response,
         PageResult::Drained => panic!("expected Page, got Drained"),
         PageResult::SplitRequired { .. } => panic!("expected Page, got SplitRequired"),
     }
@@ -174,7 +191,7 @@ pub(crate) fn unwrap_page(result: azure_core::Result<PageResult>) -> CosmosResponse {
 pub(crate) fn assert_drained(result: azure_core::Result<PageResult>) {
     match result.expect("expected Ok result") {
         PageResult::Drained => {}
-        PageResult::Page(_) => panic!("expected Drained, got Page"),
+        PageResult::Page { .. } => panic!("expected Drained, got Page"),
         PageResult::SplitRequired { .. } => panic!("expected Drained, got SplitRequired"),
     }
 }
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
index 49a5a3d872b..61044965bd8 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
@@ -2,23 +2,54 @@
 // Licensed under the MIT License.
 
 //! Dataflow pipeline nodes for paged Cosmos DB operations.
+//!
+//! Everything in this module is driver-internal except [`OperationPlan`],
+//! 
which is the only type re-exported to public APIs. The rest is the
+//! machinery `CosmosDriver` uses to plan, execute, and resume paged
+//! operations.
+//!
+//! # Navigation map
+//!
+//! - Leaf nodes: [`Request`] (executes a single Cosmos DB request and pages
+//!   through continuation tokens) and [`DrainedLeaf`] (a no-op leaf used when
+//!   resuming an already-completed plan).
+//! - Intermediate nodes: [`SequentialDrain`] iterates EPK-ordered children
+//!   left-to-right, draining each before advancing.
+//! - Planner: [`planner::build_trivial_pipeline`] handles point reads and
+//!   single-partition operations; [`planner::build_sequential_drain`] handles
+//!   cross-partition queries by consuming a backend query plan and resolving
+//!   it against the current topology.
+//! - Serializable state: [`PipelineNodeState`] (see [`snapshot`]) is the
+//!   in-memory shape of a continuation snapshot; the wire-format token lives
+//!   in [`crate::models::ContinuationToken`].
+//! - Topology adapter: [`CachedTopologyProvider`] backs the
+//!   [`TopologyProvider`] trait with the driver's
+//!   [`PartitionKeyRangeCache`](crate::driver::cache::PartitionKeyRangeCache).
+//!
+//! See `FEED_OPERATIONS_REQS.md` for the design intent behind the dataflow
+//! pipeline (paged operations, split recovery, continuation tokens, planned
+//! cross-partition strategies).
 
 mod drain;
+mod drained;
 #[cfg(test)]
 pub(crate) mod mocks;
 pub(crate) mod planner;
 pub(crate) mod query_plan;
 mod request;
+mod snapshot;
 mod topology;
 
 use std::ops::Index;
 
 use futures::future::BoxFuture;
 
-use crate::models::{CosmosOperation, CosmosResponse, FeedRange};
+use crate::models::{ContinuationToken, CosmosOperation, CosmosResponse, FeedRange};
 
 pub(crate) use drain::SequentialDrain;
+pub(crate) use drained::DrainedLeaf;
 pub(crate) use request::{Request, RequestTarget};
+pub(crate) use snapshot::PipelineNodeState;
 pub(crate) use topology::CachedTopologyProvider;
 
 /// Request execution mode for partition routing metadata.
@@ -120,7 +151,17 @@ impl<'a> PipelineContext<'a> {
 #[allow(clippy::large_enum_variant)]
 pub(crate) enum PageResult {
     /// A page of results was produced.
-    Page(CosmosResponse),
+    ///
+    /// `is_terminal` is `true` when this node has no more pages to emit
+    /// after this one — set by leaf nodes when the server returned no
+    /// continuation token, and propagated by intermediate nodes when their
+    /// last child has emitted its terminal page. Parents use this to evict
+    /// drained children eagerly so that snapshots of the pipeline do not
+    /// include children that are already done.
+    Page {
+        response: CosmosResponse,
+        is_terminal: bool,
+    },
     /// This node has no more pages to emit.
     Drained,
     /// This node's EPK range has split and needs to be replaced by new child nodes.
@@ -139,7 +180,9 @@ pub(crate) enum PageResult {
 impl std::fmt::Debug for PageResult {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            PageResult::Page(_) => f.write_str("Page(...)"),
+            PageResult::Page { is_terminal, .. } => {
+                write!(f, "Page(terminal={is_terminal})")
+            }
             PageResult::Drained => f.write_str("Drained"),
             PageResult::SplitRequired {
                 replacement_nodes, ..
@@ -226,6 +269,28 @@ pub(crate) trait PipelineNode: Send + std::any::Any {
 
     /// Consumes this node and returns its children as a `Vec`.
     fn into_children(self) -> Vec<Box<dyn PipelineNode>>;
+
+    /// Snapshots this node's state for continuation-token serialization.
+    fn snapshot_state(&self) -> PipelineNodeState;
+
+    /// Returns the EPK range this node currently targets, if known. 
+    ///
+    /// Used by intermediate nodes (e.g. [`SequentialDrain`]) to record the
+    /// current cursor position when snapshotting, without needing to know
+    /// the concrete type of their children. Defaults to `None`.
+    ///
+    /// # Invariant
+    ///
+    /// Every node in the dataflow tree is responsible for some contiguous EPK
+    /// sub-range of the container key space. Intermediate nodes that drain
+    /// children in EPK order (such as [`SequentialDrain`]) may use the front
+    /// child's `feed_range()` as their own cursor; intermediates that combine
+    /// results across ranges (e.g. a future k-way merge for streaming
+    /// `ORDER BY`) are responsible for snapshotting whatever cursor
+    /// representation makes sense for their ordering semantics.
+    fn feed_range(&self) -> Option<&FeedRange> {
+        None
+    }
 }
 
 impl dyn PipelineNode {
@@ -275,7 +340,7 @@ impl Pipeline {
         context: &mut PipelineContext<'_>,
     ) -> azure_core::Result<Option<CosmosResponse>> {
         match self.root.next_page(context).await? {
-            PageResult::Page(response) => Ok(Some(response)),
+            PageResult::Page { response, .. } => Ok(Some(response)),
             PageResult::Drained => Ok(None),
             // Defensive: today the root is always a `Request`, `SequentialDrain`,
             // or `DrainedLeaf`, none of which can bubble `SplitRequired` up past
                 "pipeline root returned SplitRequired; this indicates a planner bug",
             )),
         }
     }
+
+    /// Snapshots the pipeline's current state for continuation-token serialization.
+    pub(crate) fn snapshot_state(&self) -> PipelineNodeState {
+        self.root.snapshot_state()
+    }
 }
 
 /// An opaque plan for executing a Cosmos DB operation.
@@ -302,6 +372,17 @@ impl OperationPlan {
     pub(crate) fn new(pipeline: Pipeline) -> Self {
         Self { pipeline }
     }
+
+    /// Snapshots this plan into a [`ContinuationToken`] suitable for cross-process
+    /// resumption.
+    ///
+    /// Snapshotting walks the pipeline tree and serializes a minimal record of
+    /// each node's progress. The result can be passed back to
+    /// [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation)
+    /// (with the same operation) to resume where this plan left off.
+    pub fn to_continuation_token(&self) -> azure_core::Result<ContinuationToken> {
+        ContinuationToken::encode_v1(&self.pipeline.snapshot_state())
+    }
 }
 
 #[cfg(test)]
@@ -311,9 +392,11 @@ mod tests {
 
     #[tokio::test]
     async fn pipeline_forwards_pages_from_root() {
-        let mut pipeline = Pipeline::new(Box::new(MockLeaf::with_pages(vec![Ok(
-            PageResult::Page(response(b"page")),
-        )])));
+        let mut pipeline =
+            Pipeline::new(Box::new(MockLeaf::with_pages(vec![Ok(PageResult::Page {
+                response: response(b"page"),
+                is_terminal: false,
+            })])));
         let mut executor = NoopRequestExecutor;
         let mut topology = NoopTopologyProvider;
         let mut context = PipelineContext::new(&mut executor, &mut topology);
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
index 529c6948c06..617ff63572e 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
@@ -16,8 +16,8 @@ use crate::models::{
 
 use super::{
     query_plan::{QueryInfo, QueryPlan},
-    PartitionRoutingRefresh, Pipeline, PipelineNode, Request, RequestTarget, SequentialDrain,
-    TopologyProvider,
+    DrainedLeaf, PartitionRoutingRefresh, Pipeline, PipelineNode, PipelineNodeState, Request,
+    RequestTarget, SequentialDrain, TopologyProvider,
 };
 
 /// Builds a single-node [`Pipeline`] for a trivial operation. 
@@ -26,12 +26,19 @@ use super::{ /// one partition (point reads, single-partition queries, metadata operations). /// Use [`CosmosOperation::is_trivial`] to check eligibility before calling. /// +/// `resume` is an optional [`PipelineNodeState`] from a continuation token +/// that augments planning. Only `Request` and `Drained` shapes are accepted +/// for trivial operations; any other shape returns a `DataConversion` error. +/// /// # Panics (debug builds) /// /// Debug-asserts that the operation is indeed trivial. In release builds, /// returns an error if a non-trivial operation (e.g. a cross-partition query) /// is passed. -pub(crate) fn build_trivial_pipeline(operation: &CosmosOperation) -> azure_core::Result { +pub(crate) fn build_trivial_pipeline( + operation: &CosmosOperation, + resume: Option, +) -> azure_core::Result { debug_assert!( operation.is_trivial(), "build_trivial_pipeline called with non-trivial operation: {:?} targeting {:?}", @@ -52,6 +59,25 @@ pub(crate) fn build_trivial_pipeline(operation: &CosmosOperation) -> azure_core: ))?; } + let initial_continuation = match resume { + None => None, + Some(PipelineNodeState::Request { + server_continuation, + }) => server_continuation, + Some(PipelineNodeState::Drained) => { + return Ok(Pipeline::new(Box::new(DrainedLeaf))); + } + Some(other) => { + return Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::DataConversion, + format!( + "continuation token shape {} does not match a trivial operation", + snapshot_kind(&other) + ), + )); + } + }; + let request_target = match target { OperationTarget::None => RequestTarget::NonPartitioned, OperationTarget::PartitionKey(pk) => RequestTarget::LogicalPartitionKey(pk.clone()), @@ -64,7 +90,7 @@ pub(crate) fn build_trivial_pipeline(operation: &CosmosOperation) -> azure_core: } }; - let root = Request::new(operation.clone(), request_target, None); + let root = Request::new(operation.clone(), request_target, initial_continuation); Ok(Pipeline::new(Box::new(root))) } @@ -82,15 +108,63 @@ pub(crate) fn build_trivial_pipeline(operation: &CosmosOperation) -> azure_core: /// against the current partition topology. /// 3. Creates a [`Request`] node for each resolved range and bundles them in a /// [`SequentialDrain`]. +/// +/// `resume` is an optional [`PipelineNodeState`] from a continuation token. +/// When present, ranges whose `max_exclusive <= current_min_epk` are skipped +/// and the server continuation from `left_most` is propagated to the front +/// (resumed) leaf only. 
pub(crate) async fn build_sequential_drain( query_plan: &QueryPlan, topology_provider: &mut dyn TopologyProvider, operation: &CosmosOperation, + resume: Option, ) -> azure_core::Result { validate_query_plan(query_plan)?; + let resume = match resume { + None => None, + Some(PipelineNodeState::Drained) => { + return Ok(Pipeline::new(Box::new(DrainedLeaf))); + } + Some(PipelineNodeState::SequentialDrain { + current_min_epk, + left_most, + }) => { + let server_continuation = match *left_most { + PipelineNodeState::Request { + server_continuation, + } => server_continuation, + PipelineNodeState::Drained => None, + other => { + return Err(azure_core::Error::with_message( + azure_core::error::ErrorKind::DataConversion, + format!( + "continuation token has unsupported nested shape inside SequentialDrain: {}", + snapshot_kind(&other) + ), + )); + } + }; + Some(ResumeCursor { + current_min_epk: EffectivePartitionKey::from(current_min_epk), + server_continuation, + }) + } + Some(PipelineNodeState::Request { + server_continuation, + }) => { + // A bare Request snapshot means the cross-partition query had only + // a single child — apply it as a cursor at the minimum EPK. + Some(ResumeCursor { + current_min_epk: EffectivePartitionKey::min(), + server_continuation, + }) + } + }; + // Convert query ranges to FeedRanges and resolve against topology. let mut request_nodes: Vec> = Vec::new(); + let mut resume = resume; for query_range in &query_plan.query_ranges { let min = EffectivePartitionKey::from(query_range.min.as_str()); let max = EffectivePartitionKey::from(query_range.max.as_str()); @@ -100,17 +174,36 @@ pub(crate) async fn build_sequential_drain( .await?; for resolved_range in resolved { + // Skip ranges that are entirely below the resume cursor. + if let Some(cursor) = resume.as_ref() { + if resolved_range.range.max_exclusive() <= &cursor.current_min_epk { + continue; + } + } + + // Carry the server continuation onto the first surviving leaf, + // then clear it so subsequent leaves start fresh. + let initial_continuation = resume.as_mut().and_then(|c| c.server_continuation.take()); let target = RequestTarget::EffectivePartitionKeyRange { range: resolved_range.range, partition_key_range_id: resolved_range.partition_key_range_id, }; - request_nodes.push(Box::new(Request::new(operation.clone(), target, None))); + request_nodes.push(Box::new(Request::new( + operation.clone(), + target, + initial_continuation, + ))); } } // TODO: enforce max fan-out (default 100, configurable). See FEED_OPERATIONS_REQS.md §3. if request_nodes.is_empty() { + // Either the plan had no ranges or everything was below the cursor. + // The latter is a normal "fully drained" outcome — emit a drained leaf. + if resume.is_some() { + return Ok(Pipeline::new(Box::new(DrainedLeaf))); + } return Err(azure_core::Error::with_message( azure_core::error::ErrorKind::Other, "query plan produced no partition ranges to query", @@ -126,6 +219,20 @@ pub(crate) async fn build_sequential_drain( Ok(Pipeline::new(root)) } +/// Resume cursor extracted from a `SequentialDrain` continuation snapshot. +struct ResumeCursor { + current_min_epk: EffectivePartitionKey, + server_continuation: Option, +} + +fn snapshot_kind(state: &PipelineNodeState) -> &'static str { + match state { + PipelineNodeState::Drained => "Drained", + PipelineNodeState::Request { .. } => "Request", + PipelineNodeState::SequentialDrain { .. } => "SequentialDrain", + } +} + /// Validates that the query plan does not require features we don't yet support. 
fn validate_query_plan(plan: &QueryPlan) -> azure_core::Result<()> { if plan.hybrid_search_query_info.is_some() { @@ -238,7 +345,7 @@ mod tests { #[test] fn plans_non_partitioned_pipeline_for_database_read() { let op = CosmosOperation::read_database(test_database()); - let pipeline = build_trivial_pipeline(&op).unwrap(); + let pipeline = build_trivial_pipeline(&op, None).unwrap(); let request = pipeline.root().downcast_ref::().unwrap(); assert_eq!(*request.target(), RequestTarget::NonPartitioned); @@ -251,7 +358,7 @@ mod tests { let pk = PartitionKey::from("pk-value"); let item = ItemReference::from_name(&test_container(), pk.clone(), "doc1"); let op = CosmosOperation::read_item(item); - let pipeline = build_trivial_pipeline(&op).unwrap(); + let pipeline = build_trivial_pipeline(&op, None).unwrap(); let request = pipeline.root().downcast_ref::().unwrap(); assert_eq!( @@ -268,7 +375,7 @@ mod tests { // In debug builds, this panics via debug_assert; in release builds it returns Err. let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - build_trivial_pipeline(&op) + build_trivial_pipeline(&op, None) })); match result { @@ -383,7 +490,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-0")])]); - let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + let pipeline = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap(); assert_single_request(&pipeline, "", "FF", "pkrange-0"); } @@ -397,7 +506,9 @@ mod tests { rr("80", "FF", "pkrange-right"), ])]); - let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + let pipeline = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap(); assert_drain_requests( pipeline, &[("", "80", "pkrange-left"), ("80", "FF", "pkrange-right")], @@ -414,7 +525,9 @@ mod tests { Ok(vec![rr("80", "FF", "pkrange-C")]), ]); - let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + let pipeline = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap(); assert_drain_requests( pipeline, &[("", "40", "pkrange-A"), ("80", "FF", "pkrange-C")], @@ -432,7 +545,9 @@ mod tests { rr("80", "C0", "pkrange-3"), ])]); - let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + let pipeline = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap(); assert_drain_requests( pipeline, &[ @@ -462,7 +577,9 @@ mod tests { ]), ]); - let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + let pipeline = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap(); assert_drain_requests( pipeline, &[ @@ -482,7 +599,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-wide")])]); - let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + let pipeline = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap(); assert_single_request(&pipeline, "", "FF", "pkrange-wide"); } @@ -498,7 +617,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = NoopTopologyProvider; - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!( err.to_string(), "unsupported query feature: TOP clause in 
cross-partition queries" @@ -517,7 +638,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = NoopTopologyProvider; - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!( err.to_string(), "unsupported query feature: LIMIT clause in cross-partition queries" @@ -537,7 +660,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = NoopTopologyProvider; - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!( err.to_string(), "unsupported query feature: ORDER BY in cross-partition queries" @@ -556,7 +681,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = NoopTopologyProvider; - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!( err.to_string(), "unsupported query feature: aggregates in cross-partition queries" @@ -575,7 +702,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = NoopTopologyProvider; - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!( err.to_string(), "unsupported query feature: GROUP BY in cross-partition queries" @@ -598,7 +727,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = NoopTopologyProvider; - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!( err.to_string(), "unsupported query feature: hybrid search queries" @@ -611,7 +742,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-0")])]); - let pipeline = build_sequential_drain(&plan, &mut topology, &op).await.unwrap(); + let pipeline = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap(); assert_single_request(&pipeline, "", "FF", "pkrange-0"); } @@ -621,7 +754,9 @@ mod tests { let op = cross_partition_query_operation(); let mut topology = NoopTopologyProvider; - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!( err.to_string(), "query plan produced no partition ranges to query" @@ -637,7 +772,133 @@ mod tests { "topology resolution failed", ))]); - let err = build_sequential_drain(&plan, &mut topology, &op).await.unwrap_err(); + let err = build_sequential_drain(&plan, &mut topology, &op, None) + .await + .unwrap_err(); assert_eq!(err.to_string(), "topology resolution failed"); } + + // ----------------------------------------------------------------- + // Resume tests + // ----------------------------------------------------------------- + + #[tokio::test] + async fn resume_drained_state_yields_drained_pipeline() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-0")])]); + + let pipeline = build_sequential_drain(&plan, &mut topology, &op, 
Some(PipelineNodeState::Drained)) + .await + .unwrap(); + + // The drained pipeline immediately yields no pages. + assert!(matches!( + pipeline.snapshot_state(), + PipelineNodeState::Drained + )); + } + + #[tokio::test] + async fn resume_skips_ranges_below_cursor() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![ + rr("", "55", "pk-a"), + rr("55", "AA", "pk-b"), + rr("AA", "FF", "pk-c"), + ])]); + + // Cursor sitting at the first byte of the second range — the first + // range (max_exclusive == "55") must be skipped, the others kept. + let resume = PipelineNodeState::SequentialDrain { + current_min_epk: "55".to_owned(), + left_most: Box::new(PipelineNodeState::Request { + server_continuation: None, + }), + }; + + let pipeline = build_sequential_drain(&plan, &mut topology, &op, Some(resume)) + .await + .unwrap(); + assert_drain_requests(pipeline, &[("55", "AA", "pk-b"), ("AA", "FF", "pk-c")]); + } + + #[tokio::test] + async fn resume_propagates_server_continuation_to_first_surviving_leaf_only() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![ + rr("", "55", "pk-a"), + rr("55", "AA", "pk-b"), + rr("AA", "FF", "pk-c"), + ])]); + + let resume = PipelineNodeState::SequentialDrain { + current_min_epk: "55".to_owned(), + left_most: Box::new(PipelineNodeState::Request { + server_continuation: Some("server-token-xyz".to_owned()), + }), + }; + + let pipeline = build_sequential_drain(&plan, &mut topology, &op, Some(resume)) + .await + .unwrap(); + let snapshot = pipeline.snapshot_state(); + let PipelineNodeState::SequentialDrain { left_most, .. 
} = snapshot else { + panic!("expected SequentialDrain snapshot, got {snapshot:?}"); + }; + assert_eq!( + *left_most, + PipelineNodeState::Request { + server_continuation: Some("server-token-xyz".to_owned()), + }, + "front leaf must carry the resumed server continuation", + ); + } + + #[tokio::test] + async fn resume_with_cursor_past_all_ranges_yields_drained_pipeline() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "55", "pk-a")])]); + + let resume = PipelineNodeState::SequentialDrain { + current_min_epk: "FF".to_owned(), + left_most: Box::new(PipelineNodeState::Drained), + }; + + let pipeline = build_sequential_drain(&plan, &mut topology, &op, Some(resume)) + .await + .unwrap(); + assert!(matches!( + pipeline.snapshot_state(), + PipelineNodeState::Drained + )); + } + + #[tokio::test] + async fn resume_rejects_nested_sequential_drain_inside_left_most() { + let plan = plan_with_ranges(vec![qr("", "FF")]); + let op = cross_partition_query_operation(); + let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pk-a")])]); + + let resume = PipelineNodeState::SequentialDrain { + current_min_epk: "00".to_owned(), + left_most: Box::new(PipelineNodeState::SequentialDrain { + current_min_epk: "00".to_owned(), + left_most: Box::new(PipelineNodeState::Request { + server_continuation: None, + }), + }), + }; + + let err = build_sequential_drain(&plan, &mut topology, &op, Some(resume)) + .await + .unwrap_err(); + assert!( + err.to_string().contains("unsupported nested shape"), + "unexpected error message: {err}", + ); + } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs index 0afa1e0bf45..87c826f5b00 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs @@ -6,10 +6,13 @@ use async_trait::async_trait; use azure_core::http::StatusCode; -use crate::models::{CosmosOperation, CosmosResponse, FeedRange, PartitionKey, SubStatusCode}; +use crate::models::{ + CosmosOperation, CosmosResponse, FeedRange, PartitionKey, SubStatusCode, +}; use super::{ - ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode, ResolvedRange, + ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode, + PipelineNodeState, ResolvedRange, }; /// The target of a request node. 
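Between them, the resume tests above pin down a small invariant: ranges entirely at or below the cursor are dropped, and the saved server continuation lands on the first survivor only. The following self-contained model restates that rule outside the driver's types; plain `(min, max)` string pairs stand in for resolved EPK ranges, and the function is illustrative, not part of the patch.

```rust
/// Illustrative model of the planner's resume filter (not driver code):
/// drop every resolved range whose `max_exclusive` is at or below the
/// cursor, and hand the saved server continuation to the first survivor.
fn apply_resume_cursor<'a>(
    ranges: &[(&'a str, &'a str)], // (min_inclusive, max_exclusive) EPK bounds
    cursor_min_epk: &str,
    mut server_continuation: Option<String>,
) -> Vec<(&'a str, &'a str, Option<String>)> {
    ranges
        .iter()
        .filter(|(_, max)| *max > cursor_min_epk) // keep ranges above the cursor
        .map(|&(min, max)| (min, max, server_continuation.take())) // front leaf only
        .collect()
}

#[test]
fn resume_filter_matches_the_tests_above() {
    let out = apply_resume_cursor(
        &[("", "55"), ("55", "AA"), ("AA", "FF")],
        "55",
        Some("server-token-xyz".to_owned()),
    );
    // ("", "55") is skipped; only the first surviving leaf carries the token.
    assert_eq!(out[0], ("55", "AA", Some("server-token-xyz".to_owned())));
    assert_eq!(out[1], ("AA", "FF", None));
    assert_eq!(out.len(), 2);
}
```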
@@ -166,6 +169,22 @@ impl PipelineNode for Request {
     fn into_children(self) -> Vec<Box<dyn PipelineNode>> {
         Vec::new()
     }
+
+    fn snapshot_state(&self) -> PipelineNodeState {
+        match &self.state {
+            RequestState::Initial => PipelineNodeState::Request {
+                server_continuation: None,
+            },
+            RequestState::Continuing { continuation } => PipelineNodeState::Request {
+                server_continuation: Some(continuation.clone()),
+            },
+            RequestState::Drained => PipelineNodeState::Drained,
+        }
+    }
+
+    fn feed_range(&self) -> Option<&FeedRange> {
+        self.target.owned_range()
+    }
 }
 
 impl Request {
     fn handle_response(&mut self, response: CosmosResponse) -> PageResult {
@@ -184,7 +203,11 @@ impl Request {
             RequestState::Drained
         };
         tracing::trace!(target = ?self.target, state = ?self.state, "updated request state after response");
-        PageResult::Page(response)
+        let is_terminal = matches!(self.state, RequestState::Drained);
+        PageResult::Page {
+            response,
+            is_terminal,
+        }
     }
 
     async fn handle_partition_topology_change(
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/snapshot.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/snapshot.rs
new file mode 100644
index 00000000000..15fdc13739f
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/snapshot.rs
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! Pipeline node snapshot state used to serialize / deserialize continuation
+//! tokens.
+//!
+//! Each variant captures only the information required to reconstruct an
+//! equivalent pipeline on resume. In particular, [`SequentialDrain`] only
+//! preserves its left-most child plus an EPK floor; the planner reconstructs
+//! the remaining (yet-to-drain) children from the operation's query ranges
+//! and the current topology.
+
+use serde::{Deserialize, Serialize};
+
+/// Serializable snapshot of a [`PipelineNode`](super::PipelineNode) subtree.
+///
+/// The shape is intentionally open to future intermediate node kinds so a
+/// parent does not need to know what type its child is — every node produces
+/// a `PipelineNodeState` from `snapshot_state()`.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind", rename_all = "snake_case")]
+pub(crate) enum PipelineNodeState {
+    /// The node has produced all of its pages.
+    Drained,
+
+    /// A leaf request node.
+    ///
+    /// `server_continuation` is the opaque page token returned by the server
+    /// for the next page, or `None` when no request has yet been issued.
+    Request {
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        server_continuation: Option<String>,
+    },
+
+    /// A sequential drain over EPK-ordered children.
+    ///
+    /// Only the left-most (currently-active) child's snapshot is preserved.
+    /// `current_min_epk` is the minimum EPK still left to drain; the planner
+    /// uses it to skip ranges that are entirely below the cursor on resume.
+    SequentialDrain {
+        current_min_epk: String,
+        left_most: Box<PipelineNodeState>,
+    },
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs
new file mode 100644
index 00000000000..9bc0c0766eb
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs
@@ -0,0 +1,323 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! Continuation token type for resumable Cosmos DB feed operations.
+//!
+//! A [`ContinuationToken`] is an opaque, durable representation of where a
+//! feed operation left off. Tokens are produced by the SDK from a live
+//! [`OperationPlan`](crate::OperationPlan) and consumed by
+//! [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation)
+//! to build an equivalent pipeline that resumes at the same position.
+//!
+//! # Token format
+//!
+//! SDK-issued tokens start with a version prefix `cN.` followed by a
+//! base64url-no-pad encoded JSON document. The current version is `c1.`.
+//! Tokens with a `cN.` prefix where `N > 1` are returned by newer SDKs and
+//! are rejected with a clear error.
+//!
+//! Tokens without a `cN.` prefix are treated as opaque server-issued
+//! continuation strings and are only valid for trivial operations
+//! (single-partition or non-query operations) where the SDK can pass them
+//! through unmodified.
+
+use base64::Engine;
+use serde::{Deserialize, Serialize};
+
+use crate::driver::dataflow::PipelineNodeState;
+
+/// Current SDK token version prefix.
+const SDK_V1_PREFIX: &str = "c1.";
+
+/// Opaque continuation token for resuming a paginated Cosmos DB operation.
+///
+/// Construct one from a string returned by an earlier query (either the
+/// SDK's `to_continuation_token()` output, or — for trivial operations — a
+/// raw server-side continuation string).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ContinuationToken(String);
+
+impl ContinuationToken {
+    /// Wraps an opaque continuation string.
+    ///
+    /// No validation is performed here; the string is validated when it is
+    /// passed to
+    /// [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation).
+    pub fn from_string(token: String) -> Self {
+        Self(token)
+    }
+
+    /// Returns the underlying string form of this token.
+    pub fn as_str(&self) -> &str {
+        &self.0
+    }
+
+    /// Encodes a [`PipelineNodeState`] as a `c1.`-prefixed token.
+    pub(crate) fn encode_v1(state: &PipelineNodeState) -> azure_core::Result<Self> {
+        let json = serde_json::to_vec(state).map_err(|e| {
+            azure_core::Error::with_message(
+                azure_core::error::ErrorKind::DataConversion,
+                format!("failed to serialize continuation token state: {e}"),
+            )
+        })?;
+        let body = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(json);
+        let mut out = String::with_capacity(SDK_V1_PREFIX.len() + body.len());
+        out.push_str(SDK_V1_PREFIX);
+        out.push_str(&body);
+        Ok(Self(out))
+    }
+
+    /// Resolves this token into a planner-ready form.
+    pub(crate) fn resolve(&self) -> azure_core::Result<ResolvedToken> {
+        if let Some(rest) = self.0.strip_prefix(SDK_V1_PREFIX) {
+            let json = base64::engine::general_purpose::URL_SAFE_NO_PAD
+                .decode(rest)
+                .map_err(|e| {
+                    azure_core::Error::with_message(
+                        azure_core::error::ErrorKind::DataConversion,
+                        format!("continuation token has invalid base64 payload: {e}"),
+                    )
+                })?;
+            let state: PipelineNodeState = serde_json::from_slice(&json).map_err(|e| {
+                azure_core::Error::with_message(
+                    azure_core::error::ErrorKind::DataConversion,
+                    format!("continuation token has invalid JSON payload: {e}"),
+                )
+            })?;
+            return Ok(ResolvedToken::ClientV1(state));
+        }
+
+        if let Some(version) = parse_client_version_prefix(&self.0) {
+            return Err(azure_core::Error::with_message(
+                azure_core::error::ErrorKind::DataConversion,
+                format!(
+                    "continuation token uses unsupported version 'c{version}.'; \
+                     this SDK only understands 'c1.' tokens — upgrade to a newer SDK"
+                ),
+            ));
+        }
+
+        // No client-version prefix: treat as an opaque server-issued token.
+        Ok(ResolvedToken::ServerOpaque(self.0.clone()))
+    }
+}
+
+/// Resolved form of a [`ContinuationToken`] for use during planning.
+pub(crate) enum ResolvedToken {
+    /// A client-issued v1 token containing a snapshot of pipeline state.
+    ClientV1(PipelineNodeState),
+
+    /// An opaque server continuation string. Only valid for trivial operations.
+    ServerOpaque(String),
+}
+
+/// Returns `Some(N)` if `s` starts with `cN.` for some unsigned integer `N`,
+/// otherwise `None`.
+///
+/// The `cN.` prefix is a deliberate, reserved namespace for SDK-issued
+/// tokens (where `N` is the SDK's continuation-token format version).
+/// Server-issued opaque continuation tokens have never been observed to start
+/// with this pattern, so the SDK treats any `cN.` token as SDK-versioned and
+/// anything else as a server opaque token. If the server format ever changes
+/// to collide with `cN.`, this is the place to revisit.
+fn parse_client_version_prefix(s: &str) -> Option<u32> {
+    let after_c = s.strip_prefix('c')?;
+    let dot = after_c.find('.')?;
+    after_c[..dot].parse::<u32>().ok()
+}
+
+// Allow direct serde of ContinuationToken as a string (e.g. for users storing
+// it in a JSON document alongside other fields).
+impl Serialize for ContinuationToken {
+    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        serializer.serialize_str(&self.0)
+    }
+}
+
+impl<'de> Deserialize<'de> for ContinuationToken {
+    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        let s = String::deserialize(deserializer)?;
+        Ok(Self(s))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Decodes the base64url-no-pad payload of a `c1.`-prefixed token into
+    /// its raw JSON bytes for inspection.
+    fn decode_v1_payload(token: &ContinuationToken) -> String {
+        let body = token
+            .as_str()
+            .strip_prefix(SDK_V1_PREFIX)
+            .expect("token must be c1.-prefixed");
+        let bytes = base64::engine::general_purpose::URL_SAFE_NO_PAD
+            .decode(body)
+            .expect("payload must be valid base64url-no-pad");
+        String::from_utf8(bytes).expect("payload must be valid UTF-8")
+    }
+
+    /// Builds a `c1.` token whose payload is the given JSON string.
+ fn encode_v1_payload(json: &str) -> ContinuationToken { + let body = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(json); + ContinuationToken::from_string(format!("{SDK_V1_PREFIX}{body}")) + } + + // ── Serialization ─────────────────────────────────────────────────── + + #[test] + fn encode_v1_drained_state() { + let token = ContinuationToken::encode_v1(&PipelineNodeState::Drained).unwrap(); + assert_eq!(decode_v1_payload(&token), r#"{"kind":"drained"}"#); + } + + #[test] + fn encode_v1_request_state_omits_absent_server_continuation() { + let token = ContinuationToken::encode_v1(&PipelineNodeState::Request { + server_continuation: None, + }) + .unwrap(); + assert_eq!(decode_v1_payload(&token), r#"{"kind":"request"}"#); + } + + #[test] + fn encode_v1_request_state_includes_server_continuation() { + let token = ContinuationToken::encode_v1(&PipelineNodeState::Request { + server_continuation: Some("server-token-1".to_string()), + }) + .unwrap(); + assert_eq!( + decode_v1_payload(&token), + r#"{"kind":"request","server_continuation":"server-token-1"}"#, + ); + } + + #[test] + fn encode_v1_sequential_drain_state() { + let token = ContinuationToken::encode_v1(&PipelineNodeState::SequentialDrain { + current_min_epk: "3F".to_string(), + left_most: Box::new(PipelineNodeState::Request { + server_continuation: None, + }), + }) + .unwrap(); + assert_eq!( + decode_v1_payload(&token), + r#"{"kind":"sequential_drain","current_min_epk":"3F","left_most":{"kind":"request"}}"#, + ); + } + + // ── Deserialization ───────────────────────────────────────────────── + + #[test] + fn resolve_v1_drained_state() { + let token = encode_v1_payload(r#"{"kind":"drained"}"#); + match token.resolve().unwrap() { + ResolvedToken::ClientV1(state) => assert_eq!(state, PipelineNodeState::Drained), + other => panic!("expected ClientV1, got {other:?}"), + } + } + + #[test] + fn resolve_v1_request_state_with_server_continuation() { + let token = encode_v1_payload( + r#"{"kind":"request","server_continuation":"opaque-srv-token"}"#, + ); + match token.resolve().unwrap() { + ResolvedToken::ClientV1(state) => assert_eq!( + state, + PipelineNodeState::Request { + server_continuation: Some("opaque-srv-token".to_string()), + } + ), + other => panic!("expected ClientV1, got {other:?}"), + } + } + + #[test] + fn resolve_v1_request_state_without_server_continuation() { + let token = encode_v1_payload(r#"{"kind":"request"}"#); + match token.resolve().unwrap() { + ResolvedToken::ClientV1(state) => assert_eq!( + state, + PipelineNodeState::Request { + server_continuation: None, + } + ), + other => panic!("expected ClientV1, got {other:?}"), + } + } + + #[test] + fn resolve_v1_sequential_drain_state() { + let token = encode_v1_payload( + r#"{"kind":"sequential_drain","current_min_epk":"3F","left_most":{"kind":"request"}}"#, + ); + match token.resolve().unwrap() { + ResolvedToken::ClientV1(state) => assert_eq!( + state, + PipelineNodeState::SequentialDrain { + current_min_epk: "3F".to_string(), + left_most: Box::new(PipelineNodeState::Request { + server_continuation: None, + }), + } + ), + other => panic!("expected ClientV1, got {other:?}"), + } + } + + // ── Error and fallback paths ──────────────────────────────────────── + + #[test] + fn rejects_newer_sdk_token() { + let token = ContinuationToken::from_string("c2.somethingnew".to_string()); + let err = token.resolve().unwrap_err(); + assert!(matches!( + err.kind(), + azure_core::error::ErrorKind::DataConversion + )); + assert!(err.to_string().contains("c2.")); + } + + #[test] + 
fn server_opaque_token_when_no_prefix() {
+        let token = ContinuationToken::from_string("opaque-server-string".to_string());
+        match token.resolve().unwrap() {
+            ResolvedToken::ServerOpaque(s) => assert_eq!(s, "opaque-server-string"),
+            other => panic!("expected ServerOpaque, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_invalid_base64_in_v1_token() {
+        let token = ContinuationToken::from_string("c1.!!!notvalid!!!".to_string());
+        let err = token.resolve().unwrap_err();
+        assert!(matches!(
+            err.kind(),
+            azure_core::error::ErrorKind::DataConversion
+        ));
+    }
+
+    #[test]
+    fn rejects_invalid_json_in_v1_token() {
+        let token = encode_v1_payload(r#"{"kind":"unknown_variant"}"#);
+        let err = token.resolve().unwrap_err();
+        assert!(matches!(
+            err.kind(),
+            azure_core::error::ErrorKind::DataConversion
+        ));
+    }
+}
+
+// Hand-written `Debug` keeps `ResolvedToken` printable in the test panics
+// above without deriving `Debug` on a type that wraps driver-internal state.
+impl std::fmt::Debug for ResolvedToken {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ResolvedToken::ClientV1(state) => write!(f, "ClientV1({state:?})"),
+            ResolvedToken::ServerOpaque(s) => write!(f, "ServerOpaque({s})"),
+        }
+    }
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
index c30564d4f7e..9a4c8eff4ff 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
@@ -36,6 +36,7 @@ pub(crate) mod request_header_names {
     pub const END_EPK: &str = "x-ms-end-epk";
     pub const PARTITION_KEY: &str = "x-ms-documentdb-partitionkey";
     pub const PARTITION_KEY_RANGE_ID: &str = "x-ms-documentdb-partitionkeyrangeid";
+    pub const MAX_ITEM_COUNT: &str = "x-ms-max-item-count";
 }
 
 /// Standard Cosmos DB response header names.
@@ -117,6 +118,12 @@ pub struct CosmosRequestHeaders {
     /// Sent on query plan requests to indicate which query capabilities the
     /// client supports. The backend uses this to shape its response.
     pub supported_query_features: Option<String>,
+
+    /// Maximum number of items the server should return per page
+    /// (`x-ms-max-item-count`).
+    ///
+    /// Applies to feed-style operations such as queries and read-feed.
+ pub max_item_count: Option, } impl CosmosRequestHeaders { @@ -171,6 +178,12 @@ impl CosmosRequestHeaders { HeaderValue::from(features.clone()), ); } + if let Some(max_item_count) = self.max_item_count { + headers.insert( + request_header_names::MAX_ITEM_COUNT, + HeaderValue::from(max_item_count.to_string()), + ); + } } } @@ -759,6 +772,7 @@ mod tests { offer_throughput: None, offer_autopilot_settings: None, supported_query_features: None, + max_item_count: None, }; assert_eq!( @@ -780,6 +794,7 @@ mod tests { offer_throughput: None, offer_autopilot_settings: None, supported_query_features: None, + max_item_count: None, }; let mut headers = Headers::new(); @@ -804,6 +819,7 @@ mod tests { offer_throughput: None, offer_autopilot_settings: None, supported_query_features: None, + max_item_count: None, }; let mut headers = Headers::new(); @@ -828,6 +844,7 @@ mod tests { offer_throughput: None, offer_autopilot_settings: None, supported_query_features: None, + max_item_count: None, }; let mut headers = Headers::new(); @@ -852,6 +869,7 @@ mod tests { offer_throughput: None, offer_autopilot_settings: None, supported_query_features: None, + max_item_count: None, }; let mut headers = Headers::new(); @@ -876,6 +894,7 @@ mod tests { offer_throughput: None, offer_autopilot_settings: None, supported_query_features: None, + max_item_count: None, }; let mut headers = Headers::new(); @@ -898,4 +917,28 @@ mod tests { None ); } + #[test] + fn write_to_headers_emits_max_item_count() { + let cosmos_headers = CosmosRequestHeaders { + max_item_count: Some(7), + ..Default::default() + }; + let mut headers = Headers::new(); + cosmos_headers.write_to_headers(&mut headers); + assert_eq!( + headers.get_optional_str(&HeaderName::from_static("x-ms-max-item-count")), + Some("7") + ); + } + + #[test] + fn write_to_headers_omits_max_item_count_when_none() { + let cosmos_headers = CosmosRequestHeaders::default(); + let mut headers = Headers::new(); + cosmos_headers.write_to_headers(&mut headers); + assert_eq!( + headers.get_optional_str(&HeaderName::from_static("x-ms-max-item-count")), + None + ); + } } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs index fcd23dc191f..a7cfced4b9a 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs @@ -148,6 +148,15 @@ impl CosmosOperation { self } + /// Sets the maximum number of items the server should return per page + /// (the `x-ms-max-item-count` request header). + /// + /// Applies to feed-style operations such as queries and read-feed. + pub fn with_max_item_count(mut self, max_item_count: u32) -> Self { + self.request_headers.max_item_count = Some(max_item_count); + self + } + /// Sets the precondition for optimistic concurrency control. 
pub fn with_precondition(mut self, precondition: Precondition) -> Self { self.request_headers.precondition = Some(precondition); diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs index e3cf08d7784..13de94f6161 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/mod.rs @@ -13,6 +13,7 @@ mod account_reference; mod activity_id; mod connection_string; mod consistency_level; +mod continuation_token; pub(crate) mod cosmos_headers; mod cosmos_operation; mod cosmos_resource_reference; @@ -44,6 +45,8 @@ pub use account_reference::{AccountReference, AccountReferenceBuilder, Credentia pub use activity_id::ActivityId; pub use connection_string::ConnectionString; pub(crate) use consistency_level::DefaultConsistencyLevel; +pub use continuation_token::ContinuationToken; +pub(crate) use continuation_token::ResolvedToken; pub use cosmos_headers::{ AutoscaleAutoUpgradePolicy, AutoscaleThroughputPolicy, CosmosRequestHeaders, CosmosResponseHeaders, OfferAutoscaleSettings, From 417c7a27a5e929bfe9adc45411825012632d6758 Mon Sep 17 00:00:00 2001 From: Ashley Stanton-Nurse Date: Mon, 11 May 2026 23:09:25 +0000 Subject: [PATCH 28/29] Dataflow cleanup: remove ChildNodes, split mod.rs, drop StubTopologyProvider --- .../azure_data_cosmos/src/driver_bridge.rs | 6 +- sdk/cosmos/azure_data_cosmos/src/feed.rs | 4 +- sdk/cosmos/azure_data_cosmos/src/hash.rs | 1 + .../src/driver/cosmos_driver.rs | 47 +-- .../src/driver/dataflow/context.rs | 111 ++++++ .../src/driver/dataflow/drain.rs | 54 +-- .../src/driver/dataflow/drained.rs | 7 +- .../src/driver/dataflow/mocks.rs | 9 +- .../src/driver/dataflow/mod.rs | 350 +----------------- .../src/driver/dataflow/node.rs | 114 ++++++ .../src/driver/dataflow/pipeline.rs | 92 +++++ .../src/driver/dataflow/planner.rs | 11 +- .../src/driver/dataflow/query_plan.rs | 3 + .../src/driver/dataflow/request.rs | 29 +- .../src/models/continuation_token.rs | 27 +- .../src/models/cosmos_headers.rs | 1 + .../src/models/cosmos_operation.rs | 1 + 17 files changed, 401 insertions(+), 466 deletions(-) create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/context.rs create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/node.rs create mode 100644 sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs diff --git a/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs b/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs index 36fe17d9725..f2d251e34f7 100644 --- a/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs +++ b/sdk/cosmos/azure_data_cosmos/src/driver_bridge.rs @@ -11,11 +11,7 @@ use azure_core::{ http::{headers::Headers, response::Response, RawResponse, StatusCode}, Bytes, }; -use azure_data_cosmos_driver::{ - models::{CosmosOperation, CosmosResponse as DriverResponse, CosmosResponseHeaders}, - options::OperationOptions as DriverOperationOptions, - CosmosDriver, -}; +use azure_data_cosmos_driver::models::{CosmosResponse as DriverResponse, CosmosResponseHeaders}; use crate::{ constants::{ diff --git a/sdk/cosmos/azure_data_cosmos/src/feed.rs b/sdk/cosmos/azure_data_cosmos/src/feed.rs index ddda7133ff6..524502e2692 100644 --- a/sdk/cosmos/azure_data_cosmos/src/feed.rs +++ b/sdk/cosmos/azure_data_cosmos/src/feed.rs @@ -373,7 +373,7 @@ impl LiveState { /// underlying [`OperationPlan`]. Unit tests use [`Synthetic`](Self::Synthetic) /// to inject a pre-built sequence of pages. 
enum PageSource { - Live(LiveState), + Live(Box), #[cfg(test)] Synthetic(std::collections::VecDeque>>), #[cfg(not(test))] @@ -414,7 +414,7 @@ impl FeedItemIterator { options: OperationOptions, ) -> Self { Self { - source: PageSource::Live(LiveState::new(driver, container, plan, options)), + source: PageSource::Live(Box::new(LiveState::new(driver, container, plan, options))), current: None, _marker: PhantomData, } diff --git a/sdk/cosmos/azure_data_cosmos/src/hash.rs b/sdk/cosmos/azure_data_cosmos/src/hash.rs index 0bcc7d95740..bdaf991546e 100644 --- a/sdk/cosmos/azure_data_cosmos/src/hash.rs +++ b/sdk/cosmos/azure_data_cosmos/src/hash.rs @@ -50,6 +50,7 @@ impl From<&str> for EffectivePartitionKey { /// /// Versions 1 and 2 map directly to the driver's partition key version enum. /// Any other version falls back to V2 for forward-compatible behavior. +#[allow(dead_code)] // Currently exercised only by tests; kept for upcoming SDK API. pub fn get_hashed_partition_key_string( pk_value: &[DriverPartitionKeyValue], kind: PartitionKeyKind, diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs index bbf6c78961a..bd6757cdc99 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs @@ -12,7 +12,7 @@ use crate::{ dataflow::{ planner, query_plan::QueryPlan, CachedTopologyProvider, OperationPlan, PartitionRoutingRefresh, PipelineContext, PipelineNodeState, RequestExecutor, - RequestTarget, ResolvedRange, TopologyProvider, + RequestTarget, TopologyProvider, }, pipeline::operation_pipeline::OperationOverrides, routing::{ @@ -108,28 +108,6 @@ impl RequestExecutor for DriverRequestExecutor<'_> { } } -/// Stub topology provider for the current single-request pipeline. -/// -/// Cross-partition feed operations will replace this with a -/// [`CachedTopologyProvider`](super::dataflow::CachedTopologyProvider) backed -/// by the driver's partition key range cache. -struct StubTopologyProvider; - -impl TopologyProvider for StubTopologyProvider { - fn resolve_ranges<'a>( - &'a mut self, - _range: &'a crate::models::FeedRange, - _refresh: super::dataflow::PartitionRoutingRefresh, - ) -> BoxFuture<'a, azure_core::Result>> { - Box::pin(async { - Err(azure_core::Error::with_message( - azure_core::error::ErrorKind::Other, - "topology resolution not yet wired up for this pipeline", - )) - }) - } -} - /// Cosmos DB driver instance. /// /// A driver represents a connection to a specific Cosmos DB account. 
It is created @@ -1310,18 +1288,16 @@ impl CosmosDriver { options: &options, }; - let mut topology = match container { - Some(c) => Box::new(CachedTopologyProvider::new( - &self.pk_range_cache, - c, - |container, continuation| { - self.fetch_pk_ranges_from_service(container, continuation) - }, - )) as Box, - None => Box::new(StubTopologyProvider) as Box, - }; + let mut topology = container.map(|c| { + CachedTopologyProvider::new(&self.pk_range_cache, c, |container, continuation| { + self.fetch_pk_ranges_from_service(container, continuation) + }) + }); - let mut context = PipelineContext::new(&mut executor, topology.as_mut()); + let mut context = PipelineContext::new( + &mut executor, + topology.as_mut().map(|t| t as &mut dyn TopologyProvider), + ); plan.pipeline.next_page(&mut context).await } @@ -1641,7 +1617,8 @@ impl CosmosDriver { ); let pipeline = - planner::build_sequential_drain(&query_plan, &mut topology, operation, resume_state).await?; + planner::build_sequential_drain(&query_plan, &mut topology, operation, resume_state) + .await?; Ok(OperationPlan::new(pipeline)) } diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/context.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/context.rs new file mode 100644 index 00000000000..2018fef71a3 --- /dev/null +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/context.rs @@ -0,0 +1,111 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +//! Execution context plumbed through [`PipelineNode::next_page`] calls. + +use futures::future::BoxFuture; + +use crate::models::{CosmosOperation, CosmosResponse, FeedRange}; + +use super::request::RequestTarget; + +/// Request execution mode for partition routing metadata. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum PartitionRoutingRefresh { + /// Use existing partition routing metadata. + UseCached, + /// Force partition routing metadata to be refreshed before executing. + ForceRefresh, +} + +/// Executes leaf request nodes through the existing operation pipeline. +pub(crate) trait RequestExecutor: Send { + /// Executes a single request node. + fn execute_request<'a>( + &'a mut self, + operation: &'a CosmosOperation, + target: RequestTarget, + partition_routing_refresh: PartitionRoutingRefresh, + continuation: Option, + ) -> BoxFuture<'a, azure_core::Result>; +} + +/// Resolves EPK ranges to their current physical partition key ranges. +/// +/// Used by pipeline nodes to recover from partition topology changes (splits) +/// and by the planner to resolve initial query ranges. +/// The `PartitionKeyRangeCache` implements this trait in production. +pub(crate) trait TopologyProvider: Send { + /// Resolves the physical partitions that currently cover the given EPK range. + /// + /// `refresh` controls whether the topology cache is refreshed before resolving: + /// callers use [`PartitionRoutingRefresh::ForceRefresh`] for split recovery + /// and [`PartitionRoutingRefresh::UseCached`] for planning. + /// + /// Returns partition key range IDs paired with their EPK sub-ranges, ordered + /// by EPK from smallest to largest. + fn resolve_ranges<'a>( + &'a mut self, + range: &'a FeedRange, + refresh: PartitionRoutingRefresh, + ) -> BoxFuture<'a, azure_core::Result>>; +} + +/// A physical partition's EPK sub-range, as resolved from the current topology. +#[derive(Debug, Clone)] +pub(crate) struct ResolvedRange { + /// The partition key range ID for this physical partition. 
+ pub partition_key_range_id: String, + /// The EPK sub-range within this physical partition. + pub range: FeedRange, +} + +/// Context passed through dataflow node execution. +pub(crate) struct PipelineContext<'a> { + request_executor: &'a mut dyn RequestExecutor, + topology_provider: Option<&'a mut dyn TopologyProvider>, +} + +impl<'a> PipelineContext<'a> { + /// Creates a new pipeline execution context. + /// + /// `topology_provider` is `None` for plans that cannot need topology + /// resolution (e.g. non-partitioned resource operations). If a node calls + /// [`resolve_ranges`](Self::resolve_ranges) while it is `None`, an error + /// is returned. + pub(crate) fn new( + request_executor: &'a mut dyn RequestExecutor, + topology_provider: Option<&'a mut dyn TopologyProvider>, + ) -> Self { + Self { + request_executor, + topology_provider, + } + } + + pub(crate) async fn execute_request( + &mut self, + operation: &CosmosOperation, + target: RequestTarget, + partition_routing_refresh: PartitionRoutingRefresh, + continuation: Option, + ) -> azure_core::Result { + self.request_executor + .execute_request(operation, target, partition_routing_refresh, continuation) + .await + } + + pub(crate) async fn resolve_ranges( + &mut self, + range: &FeedRange, + refresh: PartitionRoutingRefresh, + ) -> azure_core::Result> { + let provider = self.topology_provider.as_deref_mut().ok_or_else(|| { + azure_core::Error::with_message( + azure_core::error::ErrorKind::Other, + "topology resolution requested for a plan that was not given a topology provider", + ) + })?; + provider.resolve_ranges(range, refresh).await + } +} diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs index ffb183403cc..60c2af982e2 100644 --- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs +++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drain.rs @@ -14,7 +14,7 @@ use async_trait::async_trait; use crate::models::FeedRange; -use super::{ChildNodes, PageResult, PipelineContext, PipelineNode, PipelineNodeState}; +use super::{PageResult, PipelineContext, PipelineNode, PipelineNodeState}; /// Maximum number of consecutive split retries before giving up. 
/// @@ -105,15 +105,7 @@ impl PipelineNode for SequentialDrain { } } - fn children(&self) -> ChildNodes<'_> { - let (front, back) = self.children.as_slices(); - if back.is_empty() { - ChildNodes::Slice(front) - } else { - ChildNodes::Split(front, back) - } - } - + #[cfg(test)] fn into_children(self) -> Vec> { self.children.into_iter().collect() } @@ -160,7 +152,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(child)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -204,7 +196,7 @@ mod tests { SequentialDrain::new(vec![Box::new(child1), Box::new(child2), Box::new(child3)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -230,7 +222,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_drained(drain.next_page(&mut context).await); } @@ -244,7 +236,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(child)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); let err = drain.next_page(&mut context).await.unwrap_err(); assert_eq!(err.to_string(), "test error"); @@ -282,7 +274,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(split_child), Box::new(trailing_child)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -335,7 +327,7 @@ mod tests { ]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -376,7 +368,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(split_child)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -410,7 +402,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(initial_split)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -436,7 +428,7 @@ mod 
tests { let mut drain = SequentialDrain::new(vec![current]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); let err = drain.next_page(&mut context).await.unwrap_err(); assert_eq!( @@ -459,7 +451,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(empty_child), Box::new(real_child)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -499,7 +491,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(split_child)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -533,7 +525,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(child2)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -571,7 +563,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(child2)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -609,7 +601,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(split_child)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); assert_eq!( unwrap_page(drain.next_page(&mut context).await).body(), @@ -618,16 +610,6 @@ mod tests { assert_drained(drain.next_page(&mut context).await); } - #[tokio::test] - async fn children_returns_all_nodes() { - let c1 = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]); - let c2 = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]); - let c3 = MockLeaf::with_pages(vec![Ok(PageResult::Drained)]); - - let drain = SequentialDrain::new(vec![Box::new(c1), Box::new(c2), Box::new(c3)]); - assert_eq!(drain.children().len(), 3); - } - #[tokio::test] async fn terminal_page_pops_child_eagerly() { // The first child returns one terminal page; the drain must pop it @@ -656,7 +638,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(child1), Box::new(child2)]); let mut executor = NoopRequestExecutor; let mut topology = NoopTopologyProvider; - let mut context = PipelineContext::new(&mut executor, &mut topology); + let mut context = PipelineContext::new(&mut executor, Some(&mut topology)); let page = unwrap_page(drain.next_page(&mut context).await); assert_eq!(page.body(), b"c1-final"); @@ -687,7 +669,7 @@ mod tests { let mut drain = SequentialDrain::new(vec![Box::new(only_child)]); let 
         let mut executor = NoopRequestExecutor;
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         match drain.next_page(&mut context).await.unwrap() {
             PageResult::Page {
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs
index e52a36c02b4..ac21f45bbbe 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/drained.rs
@@ -11,7 +11,7 @@
 
 use async_trait::async_trait;
 
-use super::{ChildNodes, PageResult, PipelineContext, PipelineNode, PipelineNodeState};
+use super::{PageResult, PipelineContext, PipelineNode, PipelineNodeState};
 
 pub(crate) struct DrainedLeaf;
 
@@ -24,10 +24,7 @@ impl PipelineNode for DrainedLeaf {
         Ok(PageResult::Drained)
     }
 
-    fn children(&self) -> ChildNodes<'_> {
-        ChildNodes::None
-    }
-
+    #[cfg(test)]
     fn into_children(self) -> Vec<Box<dyn PipelineNode>> {
         Vec::new()
     }
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
index 5d4e866fe01..6f7019b173c 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mocks.rs
@@ -9,8 +9,8 @@ use azure_core::http::StatusCode;
 use futures::future::BoxFuture;
 
 use super::{
-    ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode,
-    PipelineNodeState, RequestExecutor, RequestTarget, ResolvedRange, TopologyProvider,
+    PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode, PipelineNodeState,
+    RequestExecutor, RequestTarget, ResolvedRange, TopologyProvider,
 };
 use crate::{
     diagnostics::DiagnosticsContextBuilder,
@@ -58,10 +58,7 @@ impl PipelineNode for MockLeaf {
             .expect("MockLeaf: no more page results")
     }
 
-    fn children(&self) -> ChildNodes<'_> {
-        ChildNodes::None
-    }
-
+    #[cfg(test)]
     fn into_children(self) -> Vec<Box<dyn PipelineNode>> {
         vec![]
     }
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
index 61044965bd8..83e2dda65cd 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/mod.rs
@@ -30,361 +30,31 @@
 //! pipeline (paged operations, split recovery, continuation tokens, planned
 //! cross-partition strategies).
 
+mod context;
 mod drain;
 mod drained;
 #[cfg(test)]
 pub(crate) mod mocks;
+mod node;
+mod pipeline;
 pub(crate) mod planner;
 pub(crate) mod query_plan;
 mod request;
 mod snapshot;
 mod topology;
 
-use std::ops::Index;
-
-use futures::future::BoxFuture;
-
-use crate::models::{ContinuationToken, CosmosOperation, CosmosResponse, FeedRange};
-
+pub(crate) use context::{
+    PartitionRoutingRefresh, PipelineContext, RequestExecutor, ResolvedRange, TopologyProvider,
+};
 pub(crate) use drain::SequentialDrain;
 pub(crate) use drained::DrainedLeaf;
+pub(crate) use node::{PageResult, PipelineNode};
+pub use pipeline::OperationPlan;
+pub(crate) use pipeline::Pipeline;
 pub(crate) use request::{Request, RequestTarget};
 pub(crate) use snapshot::PipelineNodeState;
 pub(crate) use topology::CachedTopologyProvider;
 
-/// Request execution mode for partition routing metadata.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) enum PartitionRoutingRefresh {
-    /// Use existing partition routing metadata.
-    UseCached,
-    /// Force partition routing metadata to be refreshed before executing.
-    ForceRefresh,
-}
-
-/// Executes leaf request nodes through the existing operation pipeline.
-pub(crate) trait RequestExecutor: Send {
-    /// Executes a single request node.
-    fn execute_request<'a>(
-        &'a mut self,
-        operation: &'a CosmosOperation,
-        target: RequestTarget,
-        partition_routing_refresh: PartitionRoutingRefresh,
-        continuation: Option<String>,
-    ) -> BoxFuture<'a, azure_core::Result<CosmosResponse>>;
-}
-
-/// Resolves EPK ranges to their current physical partition key ranges.
-///
-/// Used by pipeline nodes to recover from partition topology changes (splits)
-/// and by the planner to resolve initial query ranges.
-/// The `PartitionKeyRangeCache` implements this trait in production.
-pub(crate) trait TopologyProvider: Send {
-    /// Resolves the physical partitions that currently cover the given EPK range.
-    ///
-    /// `refresh` controls whether the topology cache is refreshed before resolving:
-    /// callers use [`PartitionRoutingRefresh::ForceRefresh`] for split recovery
-    /// and [`PartitionRoutingRefresh::UseCached`] for planning.
-    ///
-    /// Returns partition key range IDs paired with their EPK sub-ranges, ordered
-    /// by EPK from smallest to largest.
-    fn resolve_ranges<'a>(
-        &'a mut self,
-        range: &'a FeedRange,
-        refresh: PartitionRoutingRefresh,
-    ) -> BoxFuture<'a, azure_core::Result<Vec<ResolvedRange>>>;
-}
-
-/// A physical partition's EPK sub-range, as resolved from the current topology.
-#[derive(Debug, Clone)]
-pub(crate) struct ResolvedRange {
-    /// The partition key range ID for this physical partition.
-    pub partition_key_range_id: String,
-    /// The EPK sub-range within this physical partition.
-    pub range: FeedRange,
-}
-
-/// Context passed through dataflow node execution.
-pub(crate) struct PipelineContext<'a> {
-    request_executor: &'a mut dyn RequestExecutor,
-    topology_provider: &'a mut dyn TopologyProvider,
-}
-
-impl<'a> PipelineContext<'a> {
-    /// Creates a new pipeline execution context.
-    pub(crate) fn new(
-        request_executor: &'a mut dyn RequestExecutor,
-        topology_provider: &'a mut dyn TopologyProvider,
-    ) -> Self {
-        Self {
-            request_executor,
-            topology_provider,
-        }
-    }
-
-    async fn execute_request(
-        &mut self,
-        operation: &CosmosOperation,
-        target: RequestTarget,
-        partition_routing_refresh: PartitionRoutingRefresh,
-        continuation: Option<String>,
-    ) -> azure_core::Result<CosmosResponse> {
-        self.request_executor
-            .execute_request(operation, target, partition_routing_refresh, continuation)
-            .await
-    }
-
-    async fn resolve_ranges(
-        &mut self,
-        range: &FeedRange,
-        refresh: PartitionRoutingRefresh,
-    ) -> azure_core::Result<Vec<ResolvedRange>> {
-        self.topology_provider.resolve_ranges(range, refresh).await
-    }
-}
-
-/// Result of a single `next_page` call on a pipeline node.
-///
-/// The `Page` variant contains a large `CosmosResponse` inline, but boxing it
-/// would add a heap allocation on every page fetch — the hot path. The `SplitRequired`
-/// variant is rare (only on partition splits), so the size difference is acceptable.
-#[must_use = "a PageResult carries the next page, drain signal, or a split request that the caller must act on"]
-#[allow(clippy::large_enum_variant)]
-pub(crate) enum PageResult {
-    /// A page of results was produced.
-    ///
-    /// `is_terminal` is `true` when this node has no more pages to emit
-    /// after this one — set by leaf nodes when the server returned no
-    /// continuation token, and propagated by intermediate nodes when their
-    /// last child has emitted its terminal page. Parents use this to evict
-    /// drained children eagerly so that snapshots of the pipeline do not
-    /// include children that are already done.
-    Page {
-        response: CosmosResponse,
-        is_terminal: bool,
-    },
-    /// This node has no more pages to emit.
-    Drained,
-    /// This node's EPK range has split and needs to be replaced by new child nodes.
-    ///
-    /// It is the parent intermediate node's responsibility to splice
-    /// `replacement_nodes` into its children list (in place of the child that
-    /// emitted this result) and re-attempt draining from the first replacement.
-    /// If a node returns `SplitRequired` to a parent that does not handle
-    /// splits (e.g. the pipeline root), the operation fails.
-    SplitRequired {
-        /// New child nodes covering the sub-ranges of the split partition.
-        replacement_nodes: Vec<Box<dyn PipelineNode>>,
-    },
-}
-
-impl std::fmt::Debug for PageResult {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            PageResult::Page { is_terminal, .. } => {
-                write!(f, "Page(terminal={is_terminal})")
-            }
-            PageResult::Drained => f.write_str("Drained"),
-            PageResult::SplitRequired {
-                replacement_nodes, ..
-            } => write!(f, "SplitRequired({} nodes)", replacement_nodes.len()),
-        }
-    }
-}
-
-/// An iterator over child pipeline nodes.
-///
-/// Used by [`PipelineNode::children`] to expose children for diagnostics
-/// without requiring a contiguous slice, which `VecDeque`-backed nodes
-/// cannot always provide.
-pub(crate) enum ChildNodes<'a> {
-    /// No children (leaf nodes).
-    None,
-    /// Children stored in a contiguous slice (e.g. a `Vec`).
-    Slice(&'a [Box<dyn PipelineNode>]),
-    /// Children stored in a `VecDeque`, exposed as two contiguous slices.
-    Split(&'a [Box<dyn PipelineNode>], &'a [Box<dyn PipelineNode>]),
-}
-
-impl<'a> ChildNodes<'a> {
-    /// Returns the total number of children.
-    pub fn len(&self) -> usize {
-        match self {
-            ChildNodes::None => 0,
-            ChildNodes::Slice(s) => s.len(),
-            ChildNodes::Split(a, b) => a.len() + b.len(),
-        }
-    }
-}
-
-impl<'a> Index<usize> for ChildNodes<'a> {
-    type Output = Box<dyn PipelineNode>;
-
-    fn index(&self, index: usize) -> &Self::Output {
-        match self {
-            ChildNodes::None => panic!("index out of bounds"),
-            ChildNodes::Slice(s) => &s[index],
-            ChildNodes::Split(a, b) => {
-                if index < a.len() {
-                    &a[index]
-                } else {
-                    &b[index - a.len()]
-                }
-            }
-        }
-    }
-}
-
-impl<'a> IntoIterator for ChildNodes<'a> {
-    type Item = &'a Box<dyn PipelineNode>;
-    type IntoIter = std::iter::Chain<
-        std::slice::Iter<'a, Box<dyn PipelineNode>>,
-        std::slice::Iter<'a, Box<dyn PipelineNode>>,
-    >;
-
-    fn into_iter(self) -> Self::IntoIter {
-        let empty: &[Box<dyn PipelineNode>] = &[];
-        match self {
-            ChildNodes::None => empty.iter().chain(empty.iter()),
-            ChildNodes::Slice(s) => s.iter().chain(empty.iter()),
-            ChildNodes::Split(a, b) => a.iter().chain(b.iter()),
-        }
-    }
-}
-
-/// A dataflow node that emits pages and may own child nodes.
-///
-/// Each `next_page` call boxes a future via `async_trait`; the per-page
-/// allocation is negligible compared to the multi-millisecond network I/O
-/// of a Cosmos DB request.
-#[async_trait::async_trait]
-pub(crate) trait PipelineNode: Send + std::any::Any {
-    /// Emits the next page of results, signals drain completion, or requests a split.
-    async fn next_page(
-        &mut self,
-        context: &mut PipelineContext<'_>,
-    ) -> azure_core::Result<PageResult>;
-
-    /// Returns the node's children for diagnostic inspection.
-    fn children(&self) -> ChildNodes<'_>;
-
-    /// Consumes this node and returns its children as a `Vec`.
-    fn into_children(self) -> Vec<Box<dyn PipelineNode>>;
-
-    /// Snapshots this node's state for continuation-token serialization.
-    fn snapshot_state(&self) -> PipelineNodeState;
-
-    /// Returns the EPK range this node currently targets, if known.
-    ///
-    /// Used by intermediate nodes (e.g. [`SequentialDrain`]) to record the
-    /// current cursor position when snapshotting, without needing to know
-    /// the concrete type of their children. Defaults to `None`.
-    ///
-    /// # Invariant
-    ///
-    /// Every node in the dataflow tree is responsible for some contiguous EPK
-    /// sub-range of the container key space. Intermediate nodes that drain
-    /// children in EPK order (such as [`SequentialDrain`]) may use the front
-    /// child's `feed_range()` as their own cursor; intermediates that combine
-    /// results across ranges (e.g. a future k-way merge for streaming
-    /// `ORDER BY`) are responsible for snapshotting whatever cursor
-    /// representation makes sense for their ordering semantics.
-    fn feed_range(&self) -> Option<&FeedRange> {
-        None
-    }
-}
-
-impl dyn PipelineNode {
-    /// Downcasts this node to a concrete type.
-    pub(crate) fn downcast_ref<T: PipelineNode>(&self) -> Option<&T> {
-        (self as &dyn std::any::Any).downcast_ref::<T>()
-    }
-
-    /// Downcasts this node to a concrete type.
-    pub(crate) fn downcast<T: PipelineNode>(self: Box<Self>) -> Option<Box<T>> {
-        (self as Box<dyn std::any::Any>).downcast::<T>().ok()
-    }
-}
-
-/// A pipeline root that owns the node tree.
-pub(crate) struct Pipeline {
-    root: Box<dyn PipelineNode>,
-}
-
-impl std::fmt::Debug for Pipeline {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Pipeline").finish_non_exhaustive()
-    }
-}
-
-impl Pipeline {
-    /// Creates a pipeline from an owned root node.
-    pub(crate) fn new(root: Box<dyn PipelineNode>) -> Self {
-        Self { root }
-    }
-
-    /// Returns a reference to the root node.
-    pub(crate) fn root(&self) -> &dyn PipelineNode {
-        &*self.root
-    }
-
-    /// Consumes the pipeline and returns the root node.
-    pub(crate) fn into_root(self) -> Box<dyn PipelineNode> {
-        self.root
-    }
-
-    /// Emits the next page from the root node.
-    ///
-    /// Returns `Ok(Some(response))` for a page, `Ok(None)` when drained.
-    pub(crate) async fn next_page(
-        &mut self,
-        context: &mut PipelineContext<'_>,
-    ) -> azure_core::Result<Option<CosmosResponse>> {
-        match self.root.next_page(context).await? {
-            PageResult::Page { response, .. } => Ok(Some(response)),
-            PageResult::Drained => Ok(None),
-            // Defensive: today the root is always a `Request`, `SequentialDrain`,
-            // or `DrainedLeaf`, none of which can bubble `SplitRequired` up past
-            // their parent. If a future node type ever does, surfacing it as an
-            // explicit error is preferable to silently dropping the page.
-            PageResult::SplitRequired { .. } => Err(azure_core::Error::with_message(
-                azure_core::error::ErrorKind::Other,
-                "root node cannot request a split; splits must be handled by a parent node",
-            )),
-        }
-    }
-
-    /// Snapshots the pipeline's current state for continuation-token serialization.
-    pub(crate) fn snapshot_state(&self) -> PipelineNodeState {
-        self.root.snapshot_state()
-    }
-}
-
-/// An opaque plan for executing a Cosmos DB operation.
-///
-/// Wraps the internal dataflow [`Pipeline`] to hide its structure from callers.
-/// Produced by [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation).
-pub struct OperationPlan {
-    pub(crate) pipeline: Pipeline,
-}
-
-impl OperationPlan {
-    /// Creates an operation plan wrapping the given pipeline.
-    pub(crate) fn new(pipeline: Pipeline) -> Self {
-        Self { pipeline }
-    }
-
-    /// Snapshots this plan into a [`ContinuationToken`] suitable for cross-process
-    /// resumption.
-    ///
-    /// Snapshotting walks the pipeline tree and serializes a minimal record of
-    /// each node's progress. The result can be passed back to
-    /// [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation)
-    /// (with the same operation) to resume where this plan left off.
-    pub fn to_continuation_token(&self) -> azure_core::Result<ContinuationToken> {
-        ContinuationToken::encode_v1(&self.pipeline.snapshot_state())
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::mocks::*;
@@ -399,7 +69,7 @@ mod tests {
         })])));
         let mut executor = NoopRequestExecutor;
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let page = pipeline.next_page(&mut context).await.unwrap().unwrap();
 
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/node.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/node.rs
new file mode 100644
index 00000000000..e2203f7d2ad
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/node.rs
@@ -0,0 +1,114 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! [`PipelineNode`] trait and [`PageResult`] returned from each pull.
+
+use async_trait::async_trait;
+
+use crate::models::{CosmosResponse, FeedRange};
+
+use super::{context::PipelineContext, snapshot::PipelineNodeState};
+
+/// Result of a single `next_page` call on a pipeline node.
+///
+/// The `Page` variant contains a large `CosmosResponse` inline, but boxing it
+/// would add a heap allocation on every page fetch — the hot path. The `SplitRequired`
+/// variant is rare (only on partition splits), so the size difference is acceptable.
+#[must_use = "a PageResult carries the next page, drain signal, or a split request that the caller must act on"]
+#[allow(clippy::large_enum_variant)]
+pub(crate) enum PageResult {
+    /// A page of results was produced.
+    ///
+    /// `is_terminal` is `true` when this node has no more pages to emit
+    /// after this one — set by leaf nodes when the server returned no
+    /// continuation token, and propagated by intermediate nodes when their
+    /// last child has emitted its terminal page. Parents use this to evict
+    /// drained children eagerly so that snapshots of the pipeline do not
+    /// include children that are already done.
+    Page {
+        response: CosmosResponse,
+        is_terminal: bool,
+    },
+    /// This node has no more pages to emit.
+    Drained,
+    /// This node's EPK range has split and needs to be replaced by new child nodes.
+    ///
+    /// It is the parent intermediate node's responsibility to splice
+    /// `replacement_nodes` into its children list (in place of the child that
+    /// emitted this result) and re-attempt draining from the first replacement.
+    /// If a node returns `SplitRequired` to a parent that does not handle
+    /// splits (e.g. the pipeline root), the operation fails.
+    SplitRequired {
+        /// New child nodes covering the sub-ranges of the split partition.
+        replacement_nodes: Vec<Box<dyn PipelineNode>>,
+    },
+}
+
+impl std::fmt::Debug for PageResult {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            PageResult::Page { is_terminal, .. } => {
+                write!(f, "Page(terminal={is_terminal})")
+            }
+            PageResult::Drained => f.write_str("Drained"),
+            PageResult::SplitRequired {
+                replacement_nodes, ..
+            } => write!(f, "SplitRequired({} nodes)", replacement_nodes.len()),
+        }
+    }
+}
+
+/// A dataflow node that emits pages and may own child nodes.
+///
+/// Each `next_page` call boxes a future via `async_trait`; the per-page
+/// allocation is negligible compared to the multi-millisecond network I/O
+/// of a Cosmos DB request.
+#[async_trait]
+pub(crate) trait PipelineNode: Send + std::any::Any {
+    /// Emits the next page of results, signals drain completion, or requests a split.
+    async fn next_page(
+        &mut self,
+        context: &mut PipelineContext<'_>,
+    ) -> azure_core::Result<PageResult>;
+
+    /// Consumes this node and returns its children as a `Vec`.
+    ///
+    /// Used by tests to inspect the dataflow tree's shape after planning.
+    #[cfg(test)]
+    fn into_children(self) -> Vec<Box<dyn PipelineNode>>;
+
+    /// Snapshots this node's state for continuation-token serialization.
+    fn snapshot_state(&self) -> PipelineNodeState;
+
+    /// Returns the EPK range this node currently targets, if known.
+    ///
+    /// Used by intermediate nodes (e.g. [`super::SequentialDrain`]) to record
+    /// the current cursor position when snapshotting, without needing to know
+    /// the concrete type of their children. Defaults to `None`.
+    ///
+    /// # Invariant
+    ///
+    /// Every node in the dataflow tree is responsible for some contiguous EPK
+    /// sub-range of the container key space. Intermediate nodes that drain
+    /// children in EPK order (such as [`super::SequentialDrain`]) may use the
+    /// front child's `feed_range()` as their own cursor; intermediates that
+    /// combine results across ranges (e.g. a future k-way merge for streaming
+    /// `ORDER BY`) are responsible for snapshotting whatever cursor
+    /// representation makes sense for their ordering semantics.
+    fn feed_range(&self) -> Option<&FeedRange> {
+        None
+    }
+}
+
+#[cfg(test)]
+impl dyn PipelineNode {
+    /// Downcasts this node to a concrete type.
+    pub(crate) fn downcast_ref<T: PipelineNode>(&self) -> Option<&T> {
+        (self as &dyn std::any::Any).downcast_ref::<T>()
+    }
+
+    /// Downcasts this node to a concrete type.
+    pub(crate) fn downcast<T: PipelineNode>(self: Box<Self>) -> Option<Box<T>> {
+        (self as Box<dyn std::any::Any>).downcast::<T>().ok()
+    }
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs
new file mode 100644
index 00000000000..9f5446a6617
--- /dev/null
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs
@@ -0,0 +1,92 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//! [`Pipeline`] (driver-internal) and [`OperationPlan`] (driver-public).
+
+use crate::models::{ContinuationToken, CosmosResponse};
+
+use super::context::PipelineContext;
+use super::node::{PageResult, PipelineNode};
+use super::snapshot::PipelineNodeState;
+
+/// A pipeline root that owns the node tree.
+pub(crate) struct Pipeline {
+    root: Box<dyn PipelineNode>,
+}
+
+impl std::fmt::Debug for Pipeline {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Pipeline").finish_non_exhaustive()
+    }
+}
+
+impl Pipeline {
+    /// Creates a pipeline from an owned root node.
+    pub(crate) fn new(root: Box<dyn PipelineNode>) -> Self {
+        Self { root }
+    }
+
+    /// Returns a reference to the root node.
+    #[cfg(test)]
+    pub(crate) fn root(&self) -> &dyn PipelineNode {
+        &*self.root
+    }
+
+    /// Consumes the pipeline and returns the root node.
+    #[cfg(test)]
+    pub(crate) fn into_root(self) -> Box<dyn PipelineNode> {
+        self.root
+    }
+
+    /// Emits the next page from the root node.
+    ///
+    /// Returns `Ok(Some(response))` for a page, `Ok(None)` when drained.
+    pub(crate) async fn next_page(
+        &mut self,
+        context: &mut PipelineContext<'_>,
+    ) -> azure_core::Result<Option<CosmosResponse>> {
+        match self.root.next_page(context).await? {
+            PageResult::Page { response, .. } => Ok(Some(response)),
+            PageResult::Drained => Ok(None),
+            // Defensive: today the root is always a `Request`, `SequentialDrain`,
+            // or `DrainedLeaf`, none of which can bubble `SplitRequired` up past
+            // their parent. If a future node type ever does, surfacing it as an
+            // explicit error is preferable to silently dropping the page.
+            PageResult::SplitRequired { .. } => Err(azure_core::Error::with_message(
+                azure_core::error::ErrorKind::Other,
+                "root node cannot request a split; splits must be handled by a parent node",
+            )),
+        }
+    }
+
+    /// Snapshots the pipeline's current state for continuation-token serialization.
+    pub(crate) fn snapshot_state(&self) -> PipelineNodeState {
+        self.root.snapshot_state()
+    }
+}
+
+/// An opaque plan for executing a Cosmos DB operation.
+///
+/// Wraps the internal dataflow [`Pipeline`] to hide its structure from callers.
+/// Produced by [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation).
+pub struct OperationPlan {
+    pub(crate) pipeline: Pipeline,
+}
+
+impl OperationPlan {
+    /// Creates an operation plan wrapping the given pipeline.
+    pub(crate) fn new(pipeline: Pipeline) -> Self {
+        Self { pipeline }
+    }
+
+    /// Snapshots this plan into a [`ContinuationToken`] suitable for cross-process
+    /// resumption.
+    ///
+    /// Snapshotting walks the pipeline tree and serializes a minimal record of
+    /// each node's progress. The result can be passed back to
+    /// [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation)
+    /// (with the same operation) to resume where this plan left off.
+    pub fn to_continuation_token(&self) -> azure_core::Result<ContinuationToken> {
+        ContinuationToken::encode_v1(&self.pipeline.snapshot_state())
+    }
+}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
index 617ff63572e..06b97193552 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/planner.rs
@@ -56,7 +56,7 @@ pub(crate) fn build_trivial_pipeline(
                 "operation target {target_desc} is not valid for resource type {resource_type}",
                 target_desc = target_description(target),
             ),
-        ))?;
+        ));
     }
 
     let initial_continuation = match resume {
@@ -286,8 +286,6 @@ fn target_description(target: &OperationTarget) -> &'static str {
 mod tests {
     use std::borrow::Cow;
 
-    use futures::FutureExt as _;
-
     use super::*;
     use crate::{
         driver::dataflow::{mocks::*, query_plan::QueryRange, ResolvedRange},
@@ -788,9 +786,10 @@ mod tests {
         let op = cross_partition_query_operation();
         let mut topology = MockTopologyProvider::new(vec![Ok(vec![rr("", "FF", "pkrange-0")])]);
 
-        let pipeline = build_sequential_drain(&plan, &mut topology, &op, Some(PipelineNodeState::Drained))
-            .await
-            .unwrap();
+        let pipeline =
+            build_sequential_drain(&plan, &mut topology, &op, Some(PipelineNodeState::Drained))
+                .await
+                .unwrap();
 
         // The drained pipeline immediately yields no pages.
         assert!(matches!(
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs
index 78f9966ba15..d2a84a95b07 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/query_plan.rs
@@ -15,6 +15,7 @@ use serde::Deserialize;
 /// The response returned by the Gateway for a query plan request.
 #[derive(Debug, Default, Deserialize)]
 #[serde(rename_all = "camelCase")]
+#[allow(dead_code)] // Wire-format fields; not all are consumed today.
 pub(crate) struct QueryPlan {
     /// The version of the query plan format.
     pub partitioned_query_execution_info_version: usize,
@@ -35,6 +36,7 @@ pub(crate) struct QueryPlan {
 /// Information about a hybrid search query.
 #[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
+#[allow(dead_code)] // Wire-format fields; hybrid search isn't fully wired yet.
 pub(crate) struct HybridSearchQueryInfo {
     /// The query used for global statistics gathering.
     pub global_statistics_query: String,
@@ -128,6 +130,7 @@ pub(crate) enum SortOrder {
 /// An EPK range covered by the query.
 #[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
+#[allow(dead_code)] // Inclusivity flags are wire-format; planner treats ranges uniformly.
 pub(crate) struct QueryRange {
     /// The minimum EPK value.
     pub min: String,
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
index 87c826f5b00..aae9efd1d59 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/request.rs
@@ -6,13 +6,11 @@
 use async_trait::async_trait;
 use azure_core::http::StatusCode;
 
-use crate::models::{
-    CosmosOperation, CosmosResponse, FeedRange, PartitionKey, SubStatusCode,
-};
+use crate::models::{CosmosOperation, CosmosResponse, FeedRange, PartitionKey, SubStatusCode};
 
 use super::{
-    ChildNodes, PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode,
-    PipelineNodeState, ResolvedRange,
+    PageResult, PartitionRoutingRefresh, PipelineContext, PipelineNode, PipelineNodeState,
+    ResolvedRange,
 };
 
 /// The target of a request node.
@@ -162,10 +160,7 @@ impl PipelineNode for Request {
         }
     }
 
-    fn children(&self) -> ChildNodes<'_> {
-        ChildNodes::None
-    }
-
+    #[cfg(test)]
     fn into_children(self) -> Vec<Box<dyn PipelineNode>> {
         Vec::new()
     }
@@ -495,7 +490,7 @@ mod tests {
 
         let mut rewritten = Vec::new();
         for mut request in requests {
-            let mut context = PipelineContext::new(&mut executor, &mut topology);
+            let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
             match request.next_page(&mut context).await.unwrap() {
                 PageResult::SplitRequired { replacement_nodes } => {
                     rewritten.extend(replacement_nodes.into_iter().map(|node| {
@@ -551,7 +546,7 @@ mod tests {
         let mut request = Request::new(operation(), logical_partition_target(), None);
         let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Ok(response(b"ok"))]);
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let page = unwrap_page(request.next_page(&mut context).await);
 
@@ -571,7 +566,7 @@ mod tests {
         let mut request = Request::new(operation(), logical_partition_target(), None);
         let mut executor = MockRequestExecutor::new(vec![Err(gone_error()), Err(gone_error())]);
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let error = request.next_page(&mut context).await.unwrap_err();
 
@@ -591,7 +586,7 @@ mod tests {
         let mut request = Request::new(operation(), logical_partition_target(), None);
         let mut executor = MockRequestExecutor::new(vec![Err(non_topology_gone_error())]);
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let error = request.next_page(&mut context).await.unwrap_err();
 
@@ -611,7 +606,7 @@ mod tests {
             Ok(response_with_continuation(b"page2", Some("token-2"))),
         ]);
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let page1 = unwrap_page(request.next_page(&mut context).await);
         let page2 = unwrap_page(request.next_page(&mut context).await);
@@ -639,7 +634,7 @@ mod tests {
         );
         let mut executor = MockRequestExecutor::new(vec![Ok(response(b"page"))]);
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let page = unwrap_page(request.next_page(&mut context).await);
 
@@ -765,7 +760,7 @@ mod tests {
             azure_core::error::ErrorKind::Other,
             "topology fetch failed",
         ))]);
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let err = request.next_page(&mut context).await.unwrap_err();
         assert_eq!(err.to_string(), "topology fetch failed");
@@ -776,7 +771,7 @@ mod tests {
         let mut request = Request::new(operation(), RequestTarget::NonPartitioned, None);
         let mut executor = MockRequestExecutor::new(vec![Err(gone_error())]);
         let mut topology = NoopTopologyProvider;
-        let mut context = PipelineContext::new(&mut executor, &mut topology);
+        let mut context = PipelineContext::new(&mut executor, Some(&mut topology));
 
         let err = request.next_page(&mut context).await.unwrap_err();
         assert!(is_partition_topology_change(&err));
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs
index 9bc0c0766eb..146fbe541e7 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/continuation_token.rs
@@ -111,6 +111,17 @@ pub(crate) enum ResolvedToken {
     ServerOpaque(String),
 }
 
+// `PipelineNodeState` lives in driver internals and is not Debug-printable
+// outside; provide a tiny Debug shim so test panic messages can include it.
+impl std::fmt::Debug for ResolvedToken {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ResolvedToken::ClientV1(state) => write!(f, "ClientV1({state:?})"),
+            ResolvedToken::ServerOpaque(s) => write!(f, "ServerOpaque({s})"),
+        }
+    }
+}
+
 /// Returns `Some(N)` if `s` starts with `c.` for some unsigned integer `N`,
 /// otherwise `None`.
 ///
@@ -221,9 +232,8 @@ mod tests {
 
     #[test]
     fn resolve_v1_request_state_with_server_continuation() {
-        let token = encode_v1_payload(
-            r#"{"kind":"request","server_continuation":"opaque-srv-token"}"#,
-        );
+        let token =
+            encode_v1_payload(r#"{"kind":"request","server_continuation":"opaque-srv-token"}"#);
         match token.resolve().unwrap() {
             ResolvedToken::ClientV1(state) => assert_eq!(
                 state,
@@ -310,14 +320,3 @@ mod tests {
         ));
     }
 }
-
-// PipelineNodeState lives in driver internals and is not Debug-printable
-// outside; provide a tiny Debug shim for the panic message above.
-impl std::fmt::Debug for ResolvedToken {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            ResolvedToken::ClientV1(state) => write!(f, "ClientV1({state:?})"),
-            ResolvedToken::ServerOpaque(s) => write!(f, "ServerOpaque({s})"),
-        }
-    }
-}
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
index 9a4c8eff4ff..7a03b6d29f9 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_headers.rs
@@ -34,6 +34,7 @@ pub(crate) mod request_header_names {
     pub const THROUGHPUT_BUCKET: &str = "x-ms-cosmos-throughput-bucket";
     pub const START_EPK: &str = "x-ms-start-epk";
     pub const END_EPK: &str = "x-ms-end-epk";
+    #[allow(dead_code)] // Reserved for future direct partition-key header writes.
     pub const PARTITION_KEY: &str = "x-ms-documentdb-partitionkey";
     pub const PARTITION_KEY_RANGE_ID: &str = "x-ms-documentdb-partitionkeyrangeid";
     pub const MAX_ITEM_COUNT: &str = "x-ms-max-item-count";
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
index a7cfced4b9a..3007a4243d2 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/models/cosmos_operation.rs
@@ -626,6 +626,7 @@ impl CosmosOperation {
     /// Creates a read-feed request for partition key ranges in a container.
     ///
     /// Used to populate the partition key range cache for topology resolution.
+    #[allow(dead_code)] // Reserved for an upcoming pk-range cache refresh path.
     pub(crate) fn read_partition_key_ranges(container: ContainerReference) -> Self {
         let resource_ref: CosmosResourceReference = CosmosResourceReference::from(container)
             .with_resource_type(ResourceType::PartitionKeyRange)

From cc744a57e6d31c1eb9821dce1b656924a7df91ab Mon Sep 17 00:00:00 2001
From: Ashley Stanton-Nurse
Date: Tue, 12 May 2026 04:17:39 +0000
Subject: [PATCH 29/29] fix doc issues

---
 .../azure_data_cosmos_driver/src/driver/cosmos_driver.rs     | 2 +-
 .../azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
index 944e5eeb8c6..0e79680df75 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/cosmos_driver.rs
@@ -1242,7 +1242,7 @@ impl CosmosDriver {
 
     /// Executes a point operation (read/write item, read database, etc.) without a pre-planned pipeline.
     ///
-    /// This is a convenience method around [`execute_operation`] that asserts at debug-time that the operation
+    /// This is a convenience method around [`execute_operation`](CosmosDriver::execute_operation) that asserts at debug-time that the operation
     /// does not return an empty page.
     pub async fn execute_point_operation(
         &self,
diff --git a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs
index 9f5446a6617..50f1a75be84 100644
--- a/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs
+++ b/sdk/cosmos/azure_data_cosmos_driver/src/driver/dataflow/pipeline.rs
@@ -65,9 +65,8 @@ impl Pipeline {
     }
 }
 
-/// An opaque plan for executing a Cosmos DB operation.
+/// A plan for executing a Cosmos DB operation.
 ///
-/// Wraps the internal dataflow [`Pipeline`] to hide its structure from callers.
 /// Produced by [`CosmosDriver::plan_operation`](crate::driver::CosmosDriver::plan_operation).
 pub struct OperationPlan {
     pub(crate) pipeline: Pipeline,
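
A non-normative sketch of the resumable pagination flow this series implements. `plan_operation` and `OperationPlan::to_continuation_token` appear in these patches, but `execute_plan_page`, `into_raw_body`, the import paths, and the exact signatures below are illustrative assumptions, not the final driver API.

```rust
use azure_data_cosmos_driver::driver::CosmosDriver;
use azure_data_cosmos_driver::models::{ContinuationToken, CosmosOperation};

// Drain a planned feed operation page by page, snapshotting a resumable
// checkpoint after each page. A later process can pass the serialized token
// back to `plan_operation` (with an equivalent operation) to resume.
// `execute_plan_page` and `into_raw_body` are hypothetical names.
async fn drain_with_checkpoints(
    driver: &CosmosDriver,
    operation: CosmosOperation,
    resume_from: Option<ContinuationToken>,
) -> azure_core::Result<Vec<Vec<u8>>> {
    // Planning resolves partition key ranges; a continuation token, when
    // present, rehydrates the pipeline from its snapshotted node states.
    let mut plan = driver.plan_operation(&operation, resume_from).await?;

    let mut pages = Vec::new();
    while let Some(response) = driver.execute_plan_page(&mut plan).await? {
        // Pages are schema-agnostic raw bytes; deserializing them is the
        // higher-level SDK's responsibility.
        pages.push(response.into_raw_body());

        // Snapshot progress into a token that can cross process boundaries
        // (e.g. round-trip through a browser) before fetching the next page.
        let _checkpoint: ContinuationToken = plan.to_continuation_token()?;
    }
    Ok(pages)
}
```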