
Commit e6ff38e

[SPARK-56686][SQL] Support streaming row-level CDC post-processing
### What changes were proposed in this pull request?

This PR implements row-level CDC post-processing (carry-over removal and update detection) for DSv2 streaming reads.

Previously, streaming `changes()` rejected any post-processing with a blanket `INVALID_CDC_OPTION.STREAMING_POST_PROCESSING_NOT_SUPPORTED` error. The batch path (added in #55508 and #55583) uses a Catalyst `Window` keyed by `(rowId, _commit_version)`, which `UnsupportedOperationChecker` rejects on streaming queries (`NON_TIME_WINDOW_NOT_SUPPORTED_IN_STREAMING`). The streaming rewrite in `ResolveChangelogTable` now expresses the same logic with streaming-allowed primitives (a hedged DataFrame-level sketch follows the user-facing changes below):

```
EventTimeWatermark(_commit_timestamp, 0s)
-> Aggregate keyed by (rowId..., _commit_version, _commit_timestamp)
   (count_if delete/insert, [min/max/count rowVersion,] collect_list(struct(*)))
-> [Filter on the carry-over predicate]
-> Generate(Inline(events))
-> [Project relabeling _change_type for delete+insert pairs]
-> Project dropping __spark_cdc_* helpers
```

Including `_commit_timestamp` in the grouping keys is required to satisfy the Append-mode streaming aggregation contract: the watermark attribute must appear among the grouping expressions. By CDC convention all rows in a single commit share `_commit_timestamp`, so relative to the batch `(rowId, _commit_version)` grouping this is semantically a no-op.

`deduplicationMode = netChanges` is still rejected -- net change computation partitions by `rowId` alone and reasons over the entire requested range, which is fundamentally cross-batch. The existing error class `INVALID_CDC_OPTION.STREAMING_POST_PROCESSING_NOT_SUPPORTED` is replaced with the more specific `INVALID_CDC_OPTION.STREAMING_NET_CHANGES_NOT_SUPPORTED`, which names the offending option and points users at the supported streaming alternatives.

Doc updates:
- `Changelog.java` clarifies that all rows of a single `_commit_version` must share `_commit_timestamp`, and that streaming reads expect strictly increasing `_commit_timestamp` across micro-batches (equal timestamps are allowed within a batch).
- `Changelog.java` notes that `containsIntermediateChanges()` is range-scoped, hence the streaming limitation for `netChanges`.
- `DataStreamReader.changes()` Scaladoc lists the `netChanges` streaming limitation.

### Why are the changes needed?

Without this PR, any streaming CDC read against a connector that emits CoW carry-over pairs (`containsCarryoverRows = true`) or represents updates as raw delete+insert (`representsUpdateAsDeleteAndInsert = true`) raises an analysis error, forcing users to fall back to batch reads. The batch-only restriction is unnecessary for these passes -- they don't need cross-version state -- and it surprises users, since the same options work on batch reads.

### Does this PR introduce _any_ user-facing change?

Yes.
- Streaming `spark.readStream.changes(...)` now supports `computeUpdates = true` and `deduplicationMode = dropCarryovers`. Previously these threw `INVALID_CDC_OPTION.STREAMING_POST_PROCESSING_NOT_SUPPORTED`.
- The error class `INVALID_CDC_OPTION.STREAMING_POST_PROCESSING_NOT_SUPPORTED` is renamed to `INVALID_CDC_OPTION.STREAMING_NET_CHANGES_NOT_SUPPORTED` with a more specific message. The new error fires only for `deduplicationMode = netChanges` on streaming reads.
- `DataStreamReader.changes()` Scaladoc is updated accordingly.
- `Changelog.java` Scaladoc clarifies the `_commit_timestamp` contract for streaming.
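To make the rewrite shape above concrete, here is a minimal DataFrame-level sketch of the `dropCarryovers` pass. It is an illustration under stated assumptions, not the PR's implementation: the real rewrite builds the Catalyst plan directly inside `ResolveChangelogTable`, the single row-identity column `id` is a placeholder, and the carry-over predicate is deliberately simplified.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

// Sketch only: `changes` is a streaming DataFrame of raw CDC rows with a
// placeholder row-identity column `id` plus the _change_type /
// _commit_version / _commit_timestamp metadata columns.
def dropCarryovers(spark: SparkSession, changes: DataFrame): DataFrame = {
  import spark.implicits._
  changes
    // EventTimeWatermark(_commit_timestamp, 0s)
    .withWatermark("_commit_timestamp", "0 seconds")
    // Aggregate keyed by (rowId..., _commit_version, _commit_timestamp); the
    // watermark attribute must be among the grouping keys for Append mode.
    .groupBy($"id", $"_commit_version", $"_commit_timestamp")
    .agg(
      count_if($"_change_type" === "delete").as("__spark_cdc_deletes"),
      count_if($"_change_type" === "insert").as("__spark_cdc_inserts"),
      collect_list(struct(col("*"))).as("__spark_cdc_events"))
    // Simplified carry-over predicate: the real pass must also tell a
    // carry-over pair apart from a genuine delete+insert update.
    .where(!($"__spark_cdc_deletes" === 1 && $"__spark_cdc_inserts" === 1))
    // Generate(Inline(events)): re-expand surviving groups into rows; the
    // inline drops the __spark_cdc_* helper columns along the way.
    .select(inline($"__spark_cdc_events"))
}
```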
### How was this patch tested?

86 tests across 4 CDC suites (all passing):
- `ResolveChangelogTableStreamingPostProcessingSuite` (new, 5 tests) -- plan-shape assertions covering carry-over removal only, update detection only, both fused, and the no-rewrite pass-through cases. Verifies the `EventTimeWatermark` + `Aggregate` + `Generate(Inline)` rewrite shape.
- `ChangelogResolutionSuite` -- the two existing streaming throw-tests are flipped to plan-shape assertions; a new test covers the `netChanges` streaming throw.
- `ResolveChangelogTablePostProcessingSuite` -- the existing streaming throw test is updated to cover the `netChanges`-only case.
- `ChangelogEndToEndSuite` -- three new streaming end-to-end tests using `InMemoryChangelogCatalog`: carry-over removal drops CoW pairs, update detection relabels delete+insert as update, and `netChanges` throws.

Also confirmed `UnsupportedOperationsSuite` (216 tests) still passes -- the rewritten plan does not contain `Window` or any other streaming-rejected operator.

### Was this patch authored or co-authored using generative AI tooling?

Generated-by: Claude Code (claude-opus-4-7)

Closes #55636 from gengliangwang/streamingCDC.

Authored-by: Gengliang Wang <gengliang@apache.org>
Signed-off-by: Gengliang Wang <gengliang@apache.org>
1 parent 8457567 commit e6ff38e

11 files changed: 1078 additions & 55 deletions


common/utils/src/main/resources/error/error-conditions.json

Lines changed: 7 additions & 2 deletions
@@ -666,6 +666,11 @@
       "The Change Data Capture (CDC) connector violated the `Changelog` contract at runtime."
     ],
     "subClass" : {
+      "NULL_COMMIT_TIMESTAMP" : {
+        "message" : [
+          "Connector emitted a row with a NULL `_commit_timestamp` on a streaming read engaging post-processing. The `Changelog` contract requires `_commit_timestamp` to be non-NULL for streaming reads, since post-processing uses it as event time to advance the watermark."
+        ]
+      },
       "UNEXPECTED_CHANGE_TYPE" : {
         "message" : [
           "Connector emitted a row with a `_change_type` value that is not one of the four supported types (`insert`, `delete`, `update_preimage`, `update_postimage`). The `Changelog` contract requires every emitted row to carry one of these four values."
@@ -3303,9 +3308,9 @@
           "`startingVersion` is required when `endingVersion` is specified for CDC queries."
         ]
       },
-      "STREAMING_POST_PROCESSING_NOT_SUPPORTED" : {
+      "STREAMING_NET_CHANGES_NOT_SUPPORTED" : {
         "message" : [
-          "Change Data Capture (CDC) streaming reads on connector `<changelogName>` do not yet support post-processing (carry-over removal, update detection, or net change computation). The requested combination of options would require post-processing, which is currently only available for batch reads. Use a batch read, or set `deduplicationMode = none` and `computeUpdates = false` to receive raw change rows in streaming."
+          "Change Data Capture (CDC) streaming reads on connector `<changelogName>` do not yet support net change computation (`deduplicationMode = netChanges`). Net change computation reasons over the entire requested version range and is currently only available for batch reads. Use a batch read, or set `deduplicationMode` to `none` or `dropCarryovers` for streaming."
         ]
       },
       "UPDATE_DETECTION_REQUIRES_CARRY_OVER_REMOVAL" : {

sql/api/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala

Lines changed: 20 additions & 0 deletions
@@ -131,6 +131,26 @@ abstract class DataStreamReader {
  *   .changes("my_table")
  * }}}
  *
+ * Streaming reads support the same `computeUpdates` and `deduplicationMode = dropCarryovers`
+ * post-processing as batch reads. `deduplicationMode = netChanges` is currently batch-only --
+ * it requires reasoning over the entire requested range, which is not incrementalized yet.
+ * Requesting it on a streaming read raises an explicit
+ * `INVALID_CDC_OPTION.STREAMING_NET_CHANGES_NOT_SUPPORTED` error.
+ *
+ * When the requested options engage row-level post-processing (carry-over removal or update
+ * detection), the rewrite injects an internal `EventTimeWatermark` on `_commit_timestamp` and a
+ * stateful streaming aggregate. Two implications follow:
+ *  - A commit's events are emitted in the next micro-batch after the commit is read
+ *    (append-mode aggregate eviction is `eventTime <= watermark`, and the watermark
+ *    advances to the max `_commit_timestamp` observed in the previous batch). A stream that
+ *    reads its last commit and stops will keep that commit's events in state until a
+ *    subsequent (no-data) micro-batch fires.
+ *  - The query is constrained to `Append` output mode; `Update` and `Complete` are rejected at
+ *    writer-start time with `STREAMING_OUTPUT_MODE.UNSUPPORTED_OPERATION`. The internal
+ *    watermark metadata is stripped from the user-visible `_commit_timestamp` output, so
+ *    downstream user-supplied watermarks on other columns do not interact with it via the
+ *    global multi-watermark policy.
+ *
  * @param tableName
  *   a qualified or unqualified name that designates a table.
  * @since 4.2.0
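Putting the Scaladoc above together, a usage sketch follows. The table name, checkpoint path, and `startingVersion` value are placeholders, and passing the CDC options through `.option(...)` is an assumption.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("cdc-stream").getOrCreate()

val query = spark.readStream
  .option("startingVersion", "0")                 // placeholder CDC range option
  .option("computeUpdates", "true")               // update detection, now streaming-capable
  .option("deduplicationMode", "dropCarryovers")  // carry-over removal, now streaming-capable
  .changes("my_table")                            // placeholder table name
  .writeStream
  .format("console")
  .option("checkpointLocation", "/tmp/cdc-checkpoint")
  .outputMode("append")                           // Update/Complete are rejected at start
  .start()

// A commit's post-processed events surface one micro-batch after the commit
// is read, once the watermark passes its _commit_timestamp.
query.awaitTermination()
```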

sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/Changelog.java

Lines changed: 40 additions & 1 deletion
@@ -35,8 +35,41 @@
  *     {@code update_preimage}, or {@code update_postimage}</li>
  *   <li>{@code _commit_version} (connector-defined type, e.g. LONG) — the version containing
  *     this change</li>
- *   <li>{@code _commit_timestamp} (TIMESTAMP) — the timestamp of the commit</li>
+ *   <li>{@code _commit_timestamp} (TIMESTAMP) — the timestamp of the commit. All rows
+ *     belonging to a single {@code _commit_version} must share the same
+ *     {@code _commit_timestamp}. For streaming reads with post-processing enabled,
+ *     two additional requirements apply:
+ *     <ol>
+ *       <li>All rows of a single commit must appear in the same micro-batch (i.e.
+ *       micro-batch boundaries align with commit boundaries).</li>
+ *       <li>Each micro-batch's rows must have {@code _commit_timestamp} strictly
+ *       greater than the maximum {@code _commit_timestamp} of any prior
+ *       micro-batch.</li>
+ *     </ol>
+ *     Streaming post-processing uses {@code _commit_timestamp} as event time with a
+ *     zero-delay watermark, so once a micro-batch observes max event time T the
+ *     global watermark advances to T. Both Spark's late-event filter and its
+ *     state-eviction predicate then use {@code eventTime <= T}, so any later row
+ *     at {@code _commit_timestamp <= T} (whether from the same commit split across
+ *     batches, a different commit emitted later, or simply an out-of-order commit)
+ *     is silently dropped as late. Requirement 1 keeps a single commit's rows
+ *     together; requirement 2 keeps distinct commits in strictly increasing
+ *     event-time order across batches. Multiple distinct commits with equal
+ *     {@code _commit_timestamp} are allowed within a single micro-batch; only
+ *     <em>across</em> batches does timestamp progression need to be strictly
+ *     increasing. Atomic-commit CDC connectors (e.g. Delta versions, Iceberg
+ *     snapshots) that derive {@code _commit_timestamp} from wall-clock time at
+ *     commit time naturally satisfy both requirements.
+ *     {@code _commit_timestamp} must be non-{@code NULL} on every row of a streaming
+ *     read engaging post-processing. The row-level rewrite raises
+ *     {@code CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP} on any row that
+ *     violates this; without the guard a NULL group key would never satisfy the
+ *     watermark eviction predicate and the row would sit in state indefinitely.</li>
  * </ul>
+ * <p>
+ * Streaming reads support carry-over removal and update detection but not net change
+ * computation. The latter requires reasoning over the entire requested range and is
+ * batch-only.
  *
  * @since 4.2.0
  */
@@ -81,6 +114,12 @@ public interface Changelog {
  *   Spark will collapse multiple changes per row identity into the net effect.
  *   If {@code false}, the connector guarantees at most one change per row identity across
  *   the entire changelog range, and Spark will skip net change computation.
+ *   <p>
+ *   Note this flag is range-scoped (across all commits in the request), not
+ *   micro-batch-scoped. Streaming CDC reads currently reject
+ *   {@code deduplicationMode = netChanges} because the per-row-identity collapse cannot
+ *   be incrementalized: a row's full history may span an unbounded number of
+ *   micro-batches.
  */
 boolean containsIntermediateChanges();
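The two streaming requirements and the non-NULL rule lend themselves to a connector-side sanity check. The sketch below is hypothetical: `ChangeRow` and `validateBatch` are illustrative names, not part of the `Changelog` interface.

```scala
// Illustrative model of an emitted CDC row; Option models a possibly-NULL
// _commit_timestamp (epoch millis).
final case class ChangeRow(commitVersion: Long, commitTimestampMs: Option[Long])

// Validates one micro-batch against the contract above, given the maximum
// _commit_timestamp observed across all prior micro-batches.
def validateBatch(batch: Seq[ChangeRow], maxPriorTs: Option[Long]): Unit = {
  // Non-NULL rule: Spark itself raises
  // CHANGELOG_CONTRACT_VIOLATION.NULL_COMMIT_TIMESTAMP for such rows.
  batch.foreach { row =>
    require(row.commitTimestampMs.isDefined,
      s"NULL _commit_timestamp in commit ${row.commitVersion}")
  }
  // All rows of one _commit_version share a single _commit_timestamp.
  batch.groupBy(_.commitVersion).foreach { case (version, rows) =>
    require(rows.map(_.commitTimestampMs).distinct.size == 1,
      s"commit $version carries mixed _commit_timestamp values")
  }
  // Strictly greater than every prior batch: a row at or below the prior max
  // would be silently dropped as late by the zero-delay watermark. This also
  // catches a commit split across batches (its second half would arrive with
  // a non-increasing timestamp).
  for (prior <- maxPriorTs; row <- batch) {
    require(row.commitTimestampMs.exists(_ > prior),
      s"_commit_timestamp ${row.commitTimestampMs.get} <= prior batch max $prior")
  }
}
```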
