Skip to content

Commit ade492d

Browse files
committed
sinks: pass arrangements directly to SinkRender impls
Replaces the pre-grouped VecCollection<(Option<Row>, DiffPair<Row>)> that render_sink previously handed sinks with an Arranged<SinkTrace> — each sink now walks cursors itself via for_each_diff_pair. - The shared key-extraction + arrangement live in render_sink in sinks.rs; zip_into_diff_pairs and combine_at_timestamp are gone. - Kafka's encode_collection walks the arrangement inside its async operator, emitting a KafkaMessage per DiffPair. - Iceberg gains a small walker operator at its input that walks the arrangement into the same (Option<Row>, DiffPair<Row>) stream shape its mint / write / commit pipeline already consumed. - Per-(key, timestamp) primary-key-violation detection is lifted into a new PkViolationWarner in sinks.rs, observed incrementally as each sink walks the arrangement instead of after a Vec<DiffPair> materialization. The immediate win is skipping the Vec<DiffPair> allocation per (key, time) in combine_at_timestamp; the larger architectural win is that sinks can now make envelope- and batch-specific decisions about cursor navigation (e.g., snapshot fast paths) without touching shared plumbing.
1 parent 5816f2f commit ade492d

4 files changed

Lines changed: 292 additions & 268 deletions

File tree

src/interchange/src/envelopes.rs

Lines changed: 5 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,14 @@ use std::collections::BTreeMap;
1111
use std::iter;
1212
use std::sync::LazyLock;
1313

14-
use differential_dataflow::operators::arrange::Arranged;
1514
use differential_dataflow::trace::implementations::BatchContainer;
16-
use differential_dataflow::trace::{BatchReader, Cursor, TraceReader};
17-
use differential_dataflow::{AsCollection, VecCollection};
18-
use itertools::{Either, EitherOrBoth, Itertools};
15+
use differential_dataflow::trace::{BatchReader, Cursor};
16+
use itertools::{Either, EitherOrBoth};
1917
use maplit::btreemap;
2018
use mz_ore::cast::CastFrom;
2119
use mz_repr::{
2220
CatalogItemId, ColumnName, Datum, Diff, Row, RowPacker, SqlColumnType, SqlScalarType,
2321
};
24-
use timely::dataflow::channels::pact::Pipeline;
25-
use timely::dataflow::operators::Operator;
2622

2723
use crate::avro::DiffPair;
2824

@@ -32,11 +28,9 @@ use crate::avro::DiffPair;
3228
/// Within a key, diffs are partitioned by sign into retractions (befores) and
3329
/// insertions (afters), sorted by timestamp, and zipped into `DiffPair`s via a
3430
/// merge-join. Pairs are emitted in ascending timestamp order for a given key;
35-
/// no ordering is guaranteed across keys.
36-
///
37-
/// This is the batch-level cursor walk underlying [`combine_at_timestamp`].
38-
/// Sinks that consume an arrangement directly can call this inside their own
39-
/// operator to stream pairs without materializing a `Vec<DiffPair>` per group.
31+
/// no ordering is guaranteed across keys. Callers are responsible for tracking
32+
/// `(key, timestamp)` boundaries themselves if they need to detect groups
33+
/// with more than one pair (e.g., for primary-key violation checks).
4034
pub fn for_each_diff_pair<B, F>(batch: &B, mut on_diff_pair: F)
4135
where
4236
B: BatchReader<Diff = Diff>,
@@ -106,111 +100,6 @@ where
106100
}
107101
}
108102

109-
/// Given a stream of batches, produce a stream of groups of DiffPairs, grouped
110-
/// by key, at each timestamp.
111-
///
112-
// This is useful for some sink envelopes (e.g., Debezium and Upsert), which
113-
// need to do specific logic based on the _entire_ set of before/after diffs for
114-
// a given key at each timestamp.
115-
pub fn combine_at_timestamp<'scope, Tr>(
116-
arranged: Arranged<'scope, Tr>,
117-
) -> VecCollection<
118-
'scope,
119-
Tr::Time,
120-
(
121-
<Tr::KeyContainer as BatchContainer>::Owned,
122-
Vec<DiffPair<Tr::ValOwn>>,
123-
),
124-
Diff,
125-
>
126-
where
127-
Tr: Clone + TraceReader<Diff = Diff, ValOwn: 'static, Time: Copy>,
128-
<Tr::KeyContainer as BatchContainer>::Owned: Clone + 'static,
129-
{
130-
arranged
131-
.stream
132-
.unary(Pipeline, "combine_at_timestamp", move |_, _| {
133-
move |input, output| {
134-
input.for_each_time(|time, batches| {
135-
let mut session = output.session(&time);
136-
for batch in batches.flat_map(IntoIterator::into_iter) {
137-
let mut befores = vec![];
138-
let mut afters = vec![];
139-
140-
let mut cursor = batch.cursor();
141-
while cursor.key_valid(batch) {
142-
let k = cursor.key(batch);
143-
144-
// Partition updates into retractions (befores)
145-
// and insertions (afters).
146-
while cursor.val_valid(batch) {
147-
let v = cursor.val(batch);
148-
cursor.map_times(batch, |t, diff| {
149-
let diff = Tr::owned_diff(diff);
150-
let update = (
151-
Tr::owned_time(t),
152-
Tr::owned_val(v),
153-
usize::cast_from(diff.unsigned_abs()),
154-
);
155-
if diff < Diff::ZERO {
156-
befores.push(update);
157-
} else {
158-
afters.push(update);
159-
}
160-
});
161-
cursor.step_val(batch);
162-
}
163-
164-
// Sort by timestamp.
165-
befores.sort_by_key(|(t, _v, _diff)| *t);
166-
afters.sort_by_key(|(t, _v, _diff)| *t);
167-
168-
// Convert diff into unary representation.
169-
let befores = befores
170-
.drain(..)
171-
.flat_map(|(t, v, cnt)| iter::repeat((t, v)).take(cnt));
172-
let afters = afters
173-
.drain(..)
174-
.flat_map(|(t, v, cnt)| iter::repeat((t, v)).take(cnt));
175-
176-
// At each timestamp, zip together the insertions
177-
// and retractions into diff pairs.
178-
let groups = itertools::merge_join_by(
179-
befores,
180-
afters,
181-
|(t1, _v1), (t2, _v2)| t1.cmp(t2),
182-
)
183-
.map(|pair| match pair {
184-
EitherOrBoth::Both((t, before), (_t, after)) => {
185-
(t, Some(before.clone()), Some(after.clone()))
186-
}
187-
EitherOrBoth::Left((t, before)) => (t, Some(before.clone()), None),
188-
EitherOrBoth::Right((t, after)) => (t, None, Some(after.clone())),
189-
})
190-
.chunk_by(|(t, _before, _after)| *t);
191-
192-
// For each timestamp, emit the group of
193-
// `DiffPair`s.
194-
for (t, group) in &groups {
195-
let group = group
196-
.map(|(_t, before, after)| DiffPair { before, after })
197-
.collect();
198-
session.give((
199-
(<Tr::KeyContainer as BatchContainer>::into_owned(k), group),
200-
t,
201-
Diff::ONE,
202-
));
203-
}
204-
205-
cursor.step_key(batch);
206-
}
207-
}
208-
});
209-
}
210-
})
211-
.as_collection()
212-
}
213-
214103
// NOTE(benesch): statically allocating transient IDs for the
215104
// transaction and row types is a bit of a hack to allow us to attach
216105
// custom names to these types in the generated Avro schema. In the

src/storage/src/render/sinks.rs

Lines changed: 97 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,11 @@
1212
use std::sync::Arc;
1313
use std::time::{Duration, Instant};
1414

15-
use differential_dataflow::operators::arrange::Arrange;
15+
use differential_dataflow::operators::arrange::{Arrange, Arranged, TraceAgent};
1616
use differential_dataflow::trace::implementations::ord_neu::{
1717
OrdValBatcher, OrdValSpine, RcOrdValBuilder,
1818
};
1919
use differential_dataflow::{AsCollection, Hashable, VecCollection};
20-
use mz_interchange::avro::DiffPair;
21-
use mz_interchange::envelopes::combine_at_timestamp;
2220
use mz_persist_client::operators::shard_source::SnapshotMode;
2321
use mz_repr::{Datum, Diff, GlobalId, Row, Timestamp};
2422
use mz_storage_operators::persist_source;
@@ -33,6 +31,10 @@ use tracing::warn;
3331
use crate::healthcheck::HealthStatusMessage;
3432
use crate::storage_state::StorageState;
3533

34+
/// The concrete trace type used to hand arranged sink input to
35+
/// [`SinkRender::render_sink`].
36+
pub(crate) type SinkTrace = TraceAgent<OrdValSpine<Option<Row>, Row, Timestamp, Diff>>;
37+
3638
/// _Renders_ complete _differential_ collections
3739
/// that represent the sink and its errors as requested
3840
/// by the original `CREATE SINK` statement.
@@ -77,46 +79,40 @@ pub(crate) fn render_sink<'scope>(
7779
);
7880
tokens.extend(persist_tokens);
7981

80-
let ok_collection =
81-
zip_into_diff_pairs(sink_id, sink, &*sink_render, ok_collection.as_collection());
82+
let arranged = arrange_sink_input(&*sink_render, ok_collection.as_collection());
83+
let key_is_synthetic = sink_render.get_key_indices().is_none()
84+
&& sink_render.get_relation_key_indices().is_none();
8285

8386
let (health, sink_tokens) = sink_render.render_sink(
8487
storage_state,
8588
sink,
8689
sink_id,
87-
ok_collection,
90+
arranged,
91+
key_is_synthetic,
8892
err_collection.as_collection(),
8993
);
9094
tokens.extend(sink_tokens);
9195
(health.leave(outer_scope), tokens)
9296
})
9397
}
9498

95-
/// Zip the input to a sink so that updates to the same key appear as
96-
/// `DiffPair`s.
97-
fn zip_into_diff_pairs<'scope>(
98-
sink_id: GlobalId,
99-
sink: &StorageSinkDesc<CollectionMetadata, mz_repr::Timestamp>,
99+
/// Extract the sink's key column(s) from each row and arrange the resulting
100+
/// `(Option<Row>, Row)` collection by key.
101+
///
102+
/// Prefers the user-specified sink key, falling back to any natural key of the
103+
/// underlying relation. When neither exists, a synthetic per-row hash is used
104+
/// purely to distribute work across workers — in that case the sink should
105+
/// treat the key as absent (`key_is_synthetic`).
106+
fn arrange_sink_input<'scope>(
100107
sink_render: &dyn SinkRender<'scope>,
101108
collection: VecCollection<'scope, Timestamp, Row, Diff>,
102-
) -> VecCollection<'scope, Timestamp, (Option<Row>, DiffPair<Row>), Diff> {
103-
// We need to consolidate the collection and group records by their key.
104-
// We'll first attempt to use the explicitly declared key when the sink was
105-
// created. If no such key exists, we'll use a key of the sink's underlying
106-
// relation, if one exists.
107-
//
108-
// If no such key exists, we'll generate a synthetic key based on the hash
109-
// of the row, just for purposes of distributing work among workers. In this
110-
// case the key offers no uniqueness guarantee.
111-
112-
let user_key_indices = sink_render.get_key_indices();
113-
let relation_key_indices = sink_render.get_relation_key_indices();
114-
let key_indices = user_key_indices
115-
.or(relation_key_indices)
109+
) -> Arranged<'scope, SinkTrace> {
110+
let key_indices = sink_render
111+
.get_key_indices()
112+
.or_else(|| sink_render.get_relation_key_indices())
116113
.map(|k| k.to_vec());
117-
let key_is_synthetic = key_indices.is_none();
118114

119-
let collection = match key_indices {
115+
let keyed = match key_indices {
120116
None => collection.map(|row| (Some(Row::pack(Some(Datum::UInt64(row.hashed())))), row)),
121117
Some(key_indices) => {
122118
let mut datum_vec = mz_repr::DatumVec::new();
@@ -132,55 +128,77 @@ fn zip_into_diff_pairs<'scope>(
132128
}
133129
};
134130

135-
// Group messages by key at each timestamp.
136-
//
137131
// Allow access to `arrange_named` because we cannot access Mz's wrapper
138132
// from here. TODO(database-issues#5046): Revisit with cluster unification.
139133
#[allow(clippy::disallowed_methods)]
140-
let mut collection =
141-
combine_at_timestamp(collection.arrange_named::<OrdValBatcher<_,_,_,_>, RcOrdValBuilder<_,_,_,_>, OrdValSpine<_, _, _, _>>("Arrange Sink"));
142-
143-
// If there is no user-specified key, remove the synthetic key.
144-
//
145-
// We don't want the synthetic key to appear in the sink's actual output; we
146-
// just needed a value to use to distribute work.
147-
if user_key_indices.is_none() {
148-
collection = collection.map(|(_key, value)| (None, value))
134+
keyed.arrange_named::<OrdValBatcher<_, _, _, _>, RcOrdValBuilder<_, _, _, _>, OrdValSpine<_, _, _, _>>("Arrange Sink")
135+
}
136+
137+
/// Rate-limited detector for primary-key uniqueness violations as a sink's
138+
/// cursor walk observes `(key, timestamp)` groups.
139+
///
140+
/// Call [`PkViolationWarner::observe`] once per emitted `DiffPair`. When the
141+
/// current `(key, timestamp)` group changes — or when input batches finish —
142+
/// call [`PkViolationWarner::flush`] so the accumulated count is evaluated.
143+
///
144+
/// Keys are identified by their `Hashable::hashed()` value rather than held
145+
/// by value, so the hot observe path does no `Row` clones. A hash collision
146+
/// can mask a PK violation but this is a purely diagnostic check, so the
147+
/// trade-off is acceptable.
148+
pub(crate) struct PkViolationWarner {
149+
sink_id: GlobalId,
150+
from_id: GlobalId,
151+
last_warning: Instant,
152+
current: Option<(u64, Timestamp)>,
153+
count: usize,
154+
}
155+
156+
impl PkViolationWarner {
157+
pub fn new(sink_id: GlobalId, from_id: GlobalId) -> Self {
158+
Self {
159+
sink_id,
160+
from_id,
161+
last_warning: Instant::now(),
162+
current: None,
163+
count: 0,
164+
}
149165
}
150166

151-
collection.flat_map({
152-
let mut last_warning = Instant::now();
153-
let from_id = sink.from;
154-
move |(mut k, vs)| {
155-
// If the key is not synthetic, emit a warning to internal logs if
156-
// we discover a primary key violation.
157-
//
158-
// TODO: put the sink in a user-visible errored state instead of
159-
// only logging internally. See:
160-
// https://github.com/MaterializeInc/database-issues/issues/5099.
161-
if !key_is_synthetic && vs.len() > 1 {
162-
// We rate limit how often we emit this warning to avoid
163-
// flooding logs.
164-
let now = Instant::now();
165-
if now.duration_since(last_warning) >= Duration::from_secs(10) {
166-
last_warning = now;
167-
warn!(
168-
?sink_id,
169-
?from_id,
170-
"primary key error: expected at most one update per key and timestamp; \
171-
this can happen when the configured sink key is not a primary key of \
172-
the sinked relation"
173-
)
174-
}
175-
}
167+
/// Record that a `DiffPair` was observed at `(key, time)`. If this starts
168+
/// a new group, the previous group's count is flushed (and warned about
169+
/// if the count was > 1).
170+
pub fn observe(&mut self, key: &Option<Row>, time: Timestamp) {
171+
// `None` keys hash to a distinct sentinel from any `Row::hashed()`;
172+
// the exact constant doesn't matter for correctness (it just needs
173+
// to be stable).
174+
let hash = key.as_ref().map(|k| k.hashed()).unwrap_or(u64::MAX);
175+
let same = self.current == Some((hash, time));
176+
if !same {
177+
self.flush();
178+
self.current = Some((hash, time));
179+
}
180+
self.count += 1;
181+
}
176182

177-
let max_idx = vs.len() - 1;
178-
vs.into_iter().enumerate().map(move |(idx, dp)| {
179-
let k = if idx == max_idx { k.take() } else { k.clone() };
180-
(k, dp)
181-
})
183+
/// Flush the pending `(key, timestamp)` group count. Emits a
184+
/// rate-limited warning if more than one `DiffPair` was observed.
185+
pub fn flush(&mut self) {
186+
if self.count > 1 {
187+
let now = Instant::now();
188+
if now.duration_since(self.last_warning) >= Duration::from_secs(10) {
189+
self.last_warning = now;
190+
warn!(
191+
sink_id = ?self.sink_id,
192+
from_id = ?self.from_id,
193+
"primary key error: expected at most one update per key and timestamp; \
194+
this can happen when the configured sink key is not a primary key of \
195+
the sinked relation"
196+
);
197+
}
182198
}
183-
})
199+
self.current = None;
200+
self.count = 0;
201+
}
184202
}
185203

186204
/// A type that can be rendered as a dataflow sink.
@@ -194,12 +212,21 @@ pub(crate) trait SinkRender<'scope> {
194212
fn get_relation_key_indices(&self) -> Option<&[usize]>;
195213

196214
/// Renders the sink's dataflow.
215+
///
216+
/// The sink receives the input as an arrangement keyed on `Option<Row>`.
217+
/// The sink is responsible for walking the arrangement (typically via
218+
/// [`mz_interchange::envelopes::for_each_diff_pair`]) and handling any
219+
/// envelope-specific diff-pair construction. When `key_is_synthetic` is
220+
/// true the arrangement's key is a per-row hash used only for worker
221+
/// distribution — the sink should treat the key as absent when producing
222+
/// output.
197223
fn render_sink(
198224
&self,
199225
storage_state: &mut StorageState,
200226
sink: &StorageSinkDesc<CollectionMetadata, Timestamp>,
201227
sink_id: GlobalId,
202-
sinked_collection: VecCollection<'scope, Timestamp, (Option<Row>, DiffPair<Row>), Diff>,
228+
arranged: Arranged<'scope, SinkTrace>,
229+
key_is_synthetic: bool,
203230
err_collection: VecCollection<'scope, Timestamp, DataflowError, Diff>,
204231
) -> (
205232
StreamVec<'scope, Timestamp, HealthStatusMessage>,

0 commit comments

Comments
 (0)