From 76dc41ad6aeb57f5a35ab07df82e03bc3724328e Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Mon, 12 Jan 2026 22:11:47 -0800
Subject: [PATCH 01/24] feat: Introducing PartitionedVector

This commit introduces `PartitionedVector` - a low-level execution
abstraction that provides an in-place, partition-aware layout of a
vector based on per-row partition IDs.

Design goals:

1. **In-place rearrangement**: Rearrange vector data in memory without
   creating multiple copies
2. **Buffer reuse**: Allow reuse of temporary buffers across multiple
   partitioning operations
3. **Minimal abstraction**: Similar to `DecodedVector`, focus on
   efficient execution rather than operator semantics
4. **Thread-unsafe by design**: Optimized for single-threaded execution
   contexts

For more information please see https://github.com/IBM/velox/issues/1703

Alchemy-item: (ID = 1150) Introducing PartitionedVector commit 1/1 - 960f41b03895ba2fc3ea3853daa035c411af549c
---
 velox/vector/CMakeLists.txt                   |   1 +
 velox/vector/PartitionedVector.cpp            | 343 ++++++++++++++++++
 velox/vector/PartitionedVector.h              | 244 +++++++++++++
 velox/vector/tests/CMakeLists.txt             |   1 +
 velox/vector/tests/PartitionedVectorTest.cpp  | 168 +++++++++
 velox/vector/tests/utils/CMakeLists.txt       |   2 +-
 .../tests/utils/PartitionedVectorTestBase.cpp | 126 +++++++
 .../tests/utils/PartitionedVectorTestBase.h   |  42 +++
 8 files changed, 926 insertions(+), 1 deletion(-)
 create mode 100644 velox/vector/PartitionedVector.cpp
 create mode 100644 velox/vector/PartitionedVector.h
 create mode 100644 velox/vector/tests/PartitionedVectorTest.cpp
 create mode 100644 velox/vector/tests/utils/PartitionedVectorTestBase.cpp
 create mode 100644 velox/vector/tests/utils/PartitionedVectorTestBase.h

diff --git a/velox/vector/CMakeLists.txt b/velox/vector/CMakeLists.txt
index 9fd4f2ca9ea..6f76bc9bfb9 100644
--- a/velox/vector/CMakeLists.txt
+++ b/velox/vector/CMakeLists.txt
@@ -22,6 +22,7 @@ velox_add_library(
   FlatVector.cpp
   LazyVector.cpp
   MapConcat.cpp
+  PartitionedVector.cpp
   SelectivityVector.cpp
   SequenceVector.cpp
   SimpleVector.cpp
diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
new file mode 100644
index 00000000000..43fb7fb5d53
--- /dev/null
+++ b/velox/vector/PartitionedVector.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/vector/PartitionedVector.h"
+
+#include "velox/vector/FlatVector.h"
+
+namespace facebook::velox {
+
+using Byte = uint8_t;
+using BitIndex = uint8_t;
+
+namespace {
+
+inline void countPartitionSizes(
+    const std::vector<uint32_t>& partitions,
+    vector_size_t* rowCounts) {
+  VELOX_DCHECK_NOT_NULL(rowCounts);
+  // Histogram pass; partition IDs index rowCounts without a range check.
+  for (const uint32_t partitionId : partitions) {
+    ++rowCounts[partitionId];
+  }
+}
+
+inline void prefixSum(vector_size_t* offsets, uint32_t numPartitions) {
+  for (uint32_t prev = 0; prev + 1 < numPartitions; ++prev) {
+    offsets[prev + 1] += offsets[prev]; // Inclusive scan, done in place.
+  }
+}
+
+inline void calculateOffsets(
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    vector_size_t* endPartitionOffsets) {
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+  if (numPartitions <= 1) {
+    // Single partition: every row belongs to it, so no counting is needed.
+    endPartitionOffsets[0] = static_cast<vector_size_t>(partitions.size());
+    return;
+  }
+  std::fill_n(endPartitionOffsets, numPartitions, 0);
+  countPartitionSizes(partitions, endPartitionOffsets);
+  prefixSum(endPartitionOffsets, numPartitions);
+}
+
+// Resets cursorPartitionOffsets to the inclusive begin position of every
+// partition. endPartitionOffsets holds numPartitions exclusive end positions,
+// so partition 0 begins at row 0 and partition i > 0 begins where partition
+// i - 1 ends, i.e. at endPartitionOffsets[i - 1]. The cursor buffer is
+// allocated from pool when it is missing, immutable or too small.
+void initializeCursorPartitionOffsets(
+    BufferPtr& cursorPartitionOffsets,
+    const BufferPtr& endPartitionOffsets,
+    uint32_t numPartitions,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+  VELOX_DCHECK_EQ(
+      endPartitionOffsets->size(), numPartitions * sizeof(vector_size_t));
+
+  ensureCapacity<vector_size_t>(cursorPartitionOffsets, numPartitions, pool);
+  auto* rawCursors = cursorPartitionOffsets->asMutable<vector_size_t>();
+  const auto* rawEnds = endPartitionOffsets->as<vector_size_t>();
+  rawCursors[0] = 0;
+  const size_t shiftedBytes = sizeof(vector_size_t) * (numPartitions - 1);
+  std::memcpy(rawCursors + 1, rawEnds, shiftedBytes);
+  cursorPartitionOffsets->setSize(numPartitions * sizeof(vector_size_t));
+}
+
+// Rearranges `values` in place so that the rows of each partition become
+// contiguous. cursorPartitionOffsets[p] must hold the first unplaced slot of
+// partition p and endPartitionOffsets[p] its exclusive end. `partitions` is
+// indexed by original row position and is not permuted by this function.
+template <typename T>
+void partitionFixedWidthValuesInPlace(
+    T* values,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    vector_size_t* cursorPartitionOffsets,
+    const vector_size_t* endPartitionOffsets) {
+  VELOX_DCHECK_NOT_NULL(values);
+  VELOX_DCHECK_NOT_NULL(cursorPartitionOffsets);
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+
+  for (uint32_t currentPartition = 0; currentPartition < numPartitions;
+       ++currentPartition) {
+    vector_size_t& offset = cursorPartitionOffsets[currentPartition];
+    const vector_size_t endOffset = endPartitionOffsets[currentPartition];
+    while (offset < endOffset) {
+      uint32_t targetPartition = partitions[offset];
+      // Evict the value at `offset` into its own partition. The value swapped
+      // back came from destinationOffset, so its ID is partitions[that slot].
+      while (targetPartition != currentPartition) {
+        auto destinationOffset = cursorPartitionOffsets[targetPartition]++;
+        std::swap(values[destinationOffset], values[offset]);
+        targetPartition = partitions[destinationOffset];
+      }
+      ++offset; // `offset` aliases this partition's cursor.
+    }
+  }
+}
+
+// Reorders the fixed-width values in inputBuffer in place, partition by
+// partition. ctx.cursorPartitionOffsets is (re)initialized as scratch space.
+template <typename T>
+void partitionFixedWidthValues(
+    BufferPtr& inputBuffer,
+    const std::vector<uint32_t>& partitions,
+    const BufferPtr& endPartitionOffsets,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(inputBuffer);
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+
+  initializeCursorPartitionOffsets(
+      ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool);
+
+  // The end offsets are only read here, so request read-only access.
+  partitionFixedWidthValuesInPlace<T>(
+      inputBuffer->asMutable<T>(),
+      partitions,
+      numPartitions,
+      ctx.cursorPartitionOffsets->asMutable<vector_size_t>(),
+      endPartitionOffsets->as<vector_size_t>());
+}
+
+// Exchanges bit `bit1` of byte1 with bit `bit2` of byte2.
+void swapBit(Byte& byte1, BitIndex bit1, Byte& byte2, BitIndex bit2) {
+  const Byte first = (byte1 >> bit1) & 1;
+  const Byte second = (byte2 >> bit2) & 1;
+  // XOR-ing both bits with (first ^ second) flips them only if they differ.
+  const Byte flip = first ^ second;
+  byte1 ^= static_cast<Byte>(flip << bit1);
+  byte2 ^= static_cast<Byte>(flip << bit2);
+}
+
+// Bit-level counterpart of partitionFixedWidthValuesInPlace(): reorders the
+// bits of `bits` (one bit per row, e.g. a nulls bitmap) in place so that the
+// bits of each partition are contiguous. Resets ctx.cursorPartitionOffsets.
+void partitionBitsInPlace(
+    Byte* bits,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    const BufferPtr& endPartitionOffsets,
+    velox::memory::MemoryPool* pool) {
+  initializeCursorPartitionOffsets(
+      ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool);
+
+  auto* rawCursorOffsets =
+      ctx.cursorPartitionOffsets->asMutable<vector_size_t>();
+  // End offsets are only read here, so request read-only access.
+  const auto* rawEndOffsets = endPartitionOffsets->as<vector_size_t>();
+
+  for (uint32_t partition = 0; partition < numPartitions; ++partition) {
+    vector_size_t& offset = rawCursorOffsets[partition];
+    const vector_size_t endOffset = rawEndOffsets[partition];
+    while (offset < endOffset) {
+      uint32_t p = partitions[offset];
+      // Keep evicting the bit at `offset` into its own partition until the bit
+      // swapped back belongs to the current partition.
+      while (p != partition) {
+        const vector_size_t destinationOffset = rawCursorOffsets[p]++;
+
+        // Row r lives in byte r / 8 (r >> 3) at bit r % 8 (r & 7). BitIndex
+        // keeps the bit positions in swapBit()'s unsigned parameter type.
+        Byte& toByte = bits[destinationOffset >> 3];
+        const auto toBit = static_cast<BitIndex>(destinationOffset & 7);
+        Byte& fromByte = bits[offset >> 3];
+        const auto fromBit = static_cast<BitIndex>(offset & 7);
+
+        swapBit(toByte, toBit, fromByte, fromBit);
+        p = partitions[destinationOffset];
+      }
+      ++offset; // `offset` aliases rawCursorOffsets[partition].
+    }
+  }
+}
+
+// Builds a PartitionedFlatVector for a flat vector whose native type is given
+// by typeKind and, when there is more than one partition, reorders its rows.
+template <TypeKind typeKind>
+PartitionedVectorPtr createPartitionedFlatVector(
+    VectorPtr vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  using NativeType = typename TypeTraits<typeKind>::NativeType;
+  auto flat = std::dynamic_pointer_cast<FlatVector<NativeType>>(vector);
+  VELOX_CHECK_NOT_NULL(flat);
+
+  auto result = std::make_shared<PartitionedFlatVector<NativeType>>(
+      flat, numPartitions, endPartitionOffsets, pool);
+  if (numPartitions > 1) {
+    result->partition(partitions, ctx);
+  }
+  return result;
+}
+
+} // namespace
+
+PartitionedVector::~PartitionedVector() = default;
+
+// Public entry point: computes the partition end offsets and delegates to the
+// overload that takes them pre-computed.
+PartitionedVectorPtr PartitionedVector::create(
+    const VectorPtr& vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_CHECK_NOT_NULL(vector);
+  VELOX_CHECK_EQ(vector->size(), partitions.size());
+  VELOX_CHECK_GT(numPartitions, 0);
+  VELOX_CHECK_NOT_NULL(pool);
+
+  // Derive the cumulative (exclusive) end offset of every partition from the
+  // per-row partition IDs. E.g. partition sizes {2, 3, 1} yield end offsets
+  // {2, 5, 6}. Partition IDs are not range-checked: with several partitions
+  // an ID >= numPartitions would index past the offsets buffer.
+  BufferPtr offsets;
+  ensureCapacity<vector_size_t>(offsets, numPartitions, pool);
+  auto* rawOffsets = offsets->asMutable<vector_size_t>();
+  calculateOffsets(partitions, numPartitions, rawOffsets);
+  offsets->setSize(numPartitions * sizeof(vector_size_t));
+
+  // The last end offset must account for every input row.
+  VELOX_DCHECK_EQ(rawOffsets[numPartitions - 1], partitions.size());
+
+  return create(vector, partitions, numPartitions, offsets, ctx, pool);
+}
+
+// Dispatches on the vector encoding; only FLAT vectors of scalar types are
+// supported so far. endPartitionOffsets must hold numPartitions entries.
+PartitionedVectorPtr PartitionedVector::create(
+    const VectorPtr& vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_CHECK_NOT_NULL(endPartitionOffsets);
+  VELOX_CHECK_EQ(
+      endPartitionOffsets->size(), numPartitions * sizeof(vector_size_t));
+
+  const auto encoding = vector->encoding();
+  const auto typeKind = vector->typeKind();
+  switch (encoding) {
+    case VectorEncoding::Simple::FLAT:
+      // Selects the createPartitionedFlatVector<kind> matching typeKind.
+      return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+          createPartitionedFlatVector,
+          typeKind,
+          vector,
+          partitions,
+          numPartitions,
+          endPartitionOffsets,
+          ctx,
+          pool);
+
+    case VectorEncoding::Simple::ROW:
+    case VectorEncoding::Simple::ARRAY:
+    case VectorEncoding::Simple::MAP:
+    case VectorEncoding::Simple::DICTIONARY:
+    case VectorEncoding::Simple::BIASED:
+    case VectorEncoding::Simple::SEQUENCE:
+    case VectorEncoding::Simple::CONSTANT:
+    case VectorEncoding::Simple::LAZY:
+      VELOX_UNSUPPORTED(
+          "Unsupported vector encoding for PartitionedVector: {}",
+          mapSimpleToName(encoding));
+    default:
+      VELOX_UNREACHABLE(
+          "Invalid vector encoding for PartitionedVector: {}", encoding);
+  }
+}
+
+VectorPtr PartitionedVector::baseVector() const {
+  return vector_; // Hands out the same vector object this instance holds.
+}
+
+std::string PartitionedVector::toString() const {
+  // Comma-separated list of the exclusive end offset of every partition.
+  std::string offsets;
+  for (uint32_t partition = 0; partition < numPartitions_; ++partition) {
+    if (partition != 0) {
+      offsets += ',';
+    }
+    offsets += fmt::format("{}", rawEndPartitionOffsets_[partition]);
+  }
+  return fmt::format(
+      "PartitionedVector[numPartitions: {}, offsets: {}]",
+      numPartitions_,
+      offsets);
+}
+
+template <typename T>
+void PartitionedFlatVector<T>::partition(
+    const std::vector<uint32_t>& partitions,
+    PartitionBuildContext& ctx) {
+  // Check rawNulls() first so a vector without nulls is not given a bitmap.
+  if (vector_->rawNulls() != nullptr) {
+    auto* nulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
+    partitionBitsInPlace(
+        nulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
+  }
+  auto values = vector_->as<FlatVector<T>>()->values();
+  if (values == nullptr) {
+    return; // No values buffer (e.g. all rows null): nothing to move.
+  }
+  // NOTE(review): bool values look bit-packed in Velox, so T = bool probably
+  // needs partitionBitsInPlace() on the values buffer instead - confirm.
+  partitionFixedWidthValues<T>(
+      values, partitions, endPartitionOffsets_, numPartitions_, ctx, pool_);
+}
+
+template <typename T>
+VectorPtr PartitionedFlatVector<T>::partitionAt(uint32_t partition) const {
+  VELOX_CHECK_LT(partition, numPartitions_);
+
+  // A partition spans [end of previous partition, own end); partition 0
+  // starts at row 0. An empty partition yields a zero-length slice.
+  const vector_size_t end = rawEndPartitionOffsets_[partition];
+  const vector_size_t begin =
+      partition > 0 ? rawEndPartitionOffsets_[partition - 1] : 0;
+  return vector_->slice(begin, end - begin);
+}
+
+} // namespace facebook::velox
diff --git a/velox/vector/PartitionedVector.h b/velox/vector/PartitionedVector.h
new file mode 100644
index 00000000000..8c0983813e9
--- /dev/null
+++ b/velox/vector/PartitionedVector.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <vector>
+
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/ComplexVector.h"
+
+namespace facebook::velox {
+
+class PartitionedVector;
+using PartitionedVectorPtr = std::shared_ptr<PartitionedVector>;
+
+namespace {
+
+// TODO: This was copied from dwio::common::BufferUtil.h. However the vector
+// module should not depend on dwio. Move this to a common place.
+// Makes `data` a mutable buffer with room for numElements values of type T.
+// A new buffer is allocated from `pool` when `data` is null, immutable or too
+// small; old bytes are copied over only if preserveOldData is set. The size
+// of a reused buffer is left unchanged; clearBits zeroes the bytes past it.
+template <typename T>
+inline void ensureCapacity(
+    BufferPtr& data,
+    size_t numElements,
+    velox::memory::MemoryPool* pool,
+    bool preserveOldData = false,
+    bool clearBits = false) {
+  const size_t requiredBytes = BaseVector::byteSize<T>(numElements);
+  const size_t previousSize = data ? data->size() : 0;
+  if (!data || !data->isMutable() || data->capacity() < requiredBytes) {
+    auto replacement = AlignedBuffer::allocate<T>(numElements, pool);
+    if (data && preserveOldData) {
+      std::memcpy(
+          replacement->template asMutable<uint8_t>(),
+          data->as<uint8_t>(),
+          previousSize);
+    }
+    data = replacement;
+  }
+
+  // Optionally zero everything between the previous size and the new extent.
+  if (clearBits && requiredBytes > previousSize) {
+    std::memset(
+        data->asMutable<int8_t>() + previousSize,
+        0,
+        requiredBytes - previousSize);
+  }
+}
+
+} // namespace
+
+/// Construction-time context used to build a PartitionedVector.
+///
+/// Holds transient scratch state used only while PartitionedVector::create()
+/// runs; none of it defines the logical state of the result. Passing the same
+/// context to several create() calls lets the scratch buffers be reused.
+struct PartitionBuildContext {
+  /// Per-partition write cursors; (re)allocated during create() if needed.
+  BufferPtr cursorPartitionOffsets = nullptr;
+
+  PartitionBuildContext() = default;
+};
+
+/// PartitionedVector provides an in-place, partition-aware layout of a vector
+/// based on per-row partition IDs.
+///
+/// This is a low-level execution abstraction, analogous to DecodedVector:
+/// - it owns partitioning metadata (offsets, indices)
+/// - it does not encode operator-specific semantics
+/// - it is intended to be reused by multiple exec components
+///   (aggregation, sorting, shuffle, etc.)
+///
+/// The partitioning operation rearranges rows so that rows belonging to the
+/// same partition occupy a contiguous range.
+///
+/// Thread-safety:
+///   This class is NOT thread-safe. All methods must be called from a single
+///   thread. Internal buffers are mutated during create().
+class PartitionedVector {
+ public:
+  /// Disable default constructor.
+  PartitionedVector() = delete;
+
+  /// Disable copy constructor and assignment.
+  PartitionedVector(const PartitionedVector& other) = delete;
+  PartitionedVector& operator=(const PartitionedVector& other) = delete;
+
+  // Use default move constructor and move assignment operator.
+  PartitionedVector(PartitionedVector&&) noexcept = default;
+  PartitionedVector& operator=(PartitionedVector&&) noexcept = default;
+
+  /// Virtual destructor.
+  virtual ~PartitionedVector();
+
+  /// Factory method to create a PartitionedVector. This is the main entry point
+  /// for constructing a PartitionedVector. The partitioning operation
+  /// rearranges rows in the base vector so that rows belonging to the same
+  /// partition occupy a contiguous range.
+  ///
+  /// Params:
+  /// - vector: the base vector to be partitioned. This is modified during
+  ///   partitioning, and becomes the underlying vector of the created
+  ///   PartitionedVector.
+  /// - partitions: a vector of partition IDs for each row in the base vector.
+  ///   The length of this vector must be the same as the number of rows in the
+  ///   base vector. Each entry must be a value between 0 and numPartitions - 1.
+  /// - numPartitions: the total number of partitions. This must be greater than
+  ///   0.
+  /// - ctx: the context object for building the partitioned vector. This
+  ///   contains transient execution context needed during construction, such as
+  ///   intermediate buffers. None of the fields in this context define the
+  ///   logical state of the PartitionedVector, and none are retained after
+  ///   create(). All fields in this context are only valid during the create()
+  ///   call.
+  /// - pool: the memory pool for allocating any necessary buffers during the
+  ///   creation of the PartitionedVector.
+  static PartitionedVectorPtr create(
+      const VectorPtr& vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions,
+      PartitionBuildContext& ctx,
+      velox::memory::MemoryPool* pool);
+
+  /// Returns the underlying vector.
+  VectorPtr baseVector() const;
+
+  /// Returns the rows of the given partition as a vector. If the partition
+  /// has no rows, returns an empty vector.
+  virtual VectorPtr partitionAt(uint32_t partition) const = 0;
+
+  template <typename T>
+  T* as() {
+    static_assert(std::is_base_of_v<PartitionedVector, T>);
+    return dynamic_cast<T*>(this); // nullptr when *this is not a T.
+  }
+
+  TypeKind typeKind() const {
+    return vector_->typeKind();
+  }
+
+  vector_size_t* rawPartitionOffsets() {
+    return rawEndPartitionOffsets_; // Exclusive end offset per partition.
+  }
+
+  virtual const vector_size_t* rawSizes() = 0;
+
+  /// Returns a string with the partition count and the partition end offsets.
+  virtual std::string toString() const;
+
+ protected:
+  // Internal create method that accepts pre-computed endPartitionOffsets
+  // buffer.
+  static PartitionedVectorPtr create(
+      const VectorPtr& vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsetsBuffer,
+      PartitionBuildContext& ctx,
+      velox::memory::MemoryPool* pool);
+
+  PartitionedVector(
+      const VectorPtr& vector,
+      uint32_t numPartitions,
+      const BufferPtr& endPartitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : vector_(vector),
+        numPartitions_(numPartitions),
+        endPartitionOffsets_(endPartitionOffsets),
+        pool_(pool) {
+    VELOX_CHECK_NOT_NULL(vector_);
+    VELOX_CHECK_GT(numPartitions_, 0);
+    VELOX_CHECK_NOT_NULL(endPartitionOffsets_);
+    VELOX_CHECK_EQ(
+        endPartitionOffsets_->size(), numPartitions_ * sizeof(vector_size_t));
+    VELOX_CHECK_NOT_NULL(pool_);
+
+    rawEndPartitionOffsets_ = endPartitionOffsets_->asMutable<vector_size_t>();
+  }
+
+  virtual void partition(
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) = 0;
+
+  // The base vector that is being partitioned. This is modified during
+  // partitioning.
+  VectorPtr vector_;
+
+  // Total number of partitions. This is set at construction and does not change
+  // during partitioning. It doesn't have a const qualifier because we want to
+  // allow the move assignment operator.
+  uint32_t numPartitions_;
+
+  // The cumulative end row offsets for each partition. For example, if there
+  // are 3 partitions with 2, 3, and 1 rows respectively, then
+  // endPartitionOffsets_[0] = 2, endPartitionOffsets_[1] = 5, and
+  // endPartitionOffsets_[2] = 6.
+  BufferPtr endPartitionOffsets_;
+
+  // The raw pointer to the endPartitionOffsets_ buffer for easy access during
+  // partitioning.
+  vector_size_t* rawEndPartitionOffsets_;
+
+  velox::memory::MemoryPool* pool_; // Used for scratch buffer allocations.
+};
+
+using PartitionedVectorPtr = std::shared_ptr<PartitionedVector>;
+
+template <typename T>
+class PartitionedFlatVector : public PartitionedVector {
+ public:
+  PartitionedFlatVector(
+      const VectorPtr& flatVector,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : PartitionedVector(flatVector, numPartitions, partitionOffsets, pool) {}
+  /// Reorders the nulls (if any) and the values of the flat vector in place.
+  void partition(
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) override;
+  /// Returns the rows of `partition` as a slice of the base vector.
+  VectorPtr partitionAt(uint32_t partition) const override;
+  /// Not implemented for flat vectors; always fails.
+  const vector_size_t* rawSizes() override {
+    VELOX_UNREACHABLE("PartitionedFlatVector does not implement rawSizes()");
+  }
+};
+
+} // namespace facebook::velox
diff --git a/velox/vector/tests/CMakeLists.txt b/velox/vector/tests/CMakeLists.txt
index 24478b9c8e5..08277820124 100644
--- a/velox/vector/tests/CMakeLists.txt
+++ b/velox/vector/tests/CMakeLists.txt
@@ -25,6 +25,7 @@ add_executable(
   LazyVectorTest.cpp
   MapConcatTest.cpp
   MayHaveNullsRecursiveTest.cpp
+  PartitionedVectorTest.cpp
   SelectivityVectorTest.cpp
   StringVectorBufferTest.cpp
   VariantToVectorTest.cpp
diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp
new file mode 100644
index 00000000000..df5b586ec6a
--- /dev/null
+++ b/velox/vector/tests/PartitionedVectorTest.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <iostream>
+#include <random>
+
+#include <gtest/gtest.h>
+
+#include "vector/tests/utils/VectorTestBase.h"
+#include "velox/vector/PartitionedVector.h"
+#include "velox/vector/tests/utils/PartitionedVectorTestBase.h"
+
+namespace facebook::velox::test {
+
+class PartitioningVectorTest : public testing::TestWithParam<int>,
+                               public test::PartitionedVectorTestBase {
+ protected:
+  // Seeded from std::random_device, so the random layouts differ per run.
+  std::mt19937 gen_ = std::mt19937(std::random_device{}());
+
+  PartitionBuildContext ctx_;
+  BufferPtr partitionOffsets_;
+
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance({});
+  }
+
+  // Partitions `vector` with PartitionedVector::create() and checks every
+  // partition against the reference result built by
+  // partitionVectorByWrapping(). `vector` is modified in place.
+  void testPartitionedVector(
+      VectorPtr vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions) {
+    // Back up the vector before calling PartitionedVector::create()
+    VectorPtr vectorCopy = BaseVector::copy(*vector);
+    // Build the expected vector using the reference implementation
+    std::vector<VectorPtr> expectedVectors =
+        partitionVectorByWrapping(vectorCopy, partitions, numPartitions);
+
+    // Initialize buffers needed for PartitionedVector::create()
+    ensureCapacity<vector_size_t>(
+        ctx_.cursorPartitionOffsets, numPartitions, pool_.get());
+
+    // Create the partitioned vector using the actual implementation
+    auto partitionedVector = PartitionedVector::create(
+        vector, partitions, numPartitions, ctx_, pool_.get());
+    VELOX_CHECK_NOT_NULL(partitionedVector);
+
+    // Extract each partition and compare with expected results
+    std::vector<VectorPtr> partitionedVectors;
+    partitionedVectors.reserve(numPartitions);
+    for (uint32_t i = 0; i < numPartitions; ++i) {
+      partitionedVectors.push_back(partitionedVector->partitionAt(i));
+    }
+
+    for (uint32_t i = 0; i < numPartitions; ++i) {
+      test::assertEqualVectors(
+          expectedVectors[i], canonicalize(partitionedVectors[i]));
+    }
+  }
+
+  // Runs testPartitionedVector() on copies of `vector` for a range of
+  // partition layouts: one partition, round-robin over 2/3/4 partitions with
+  // empty first/last partitions, one row per partition, and random IDs.
+  void testVectorPartitioning(VectorPtr vector) {
+    auto numRows = vector->size();
+    std::vector<uint32_t> partitions(numRows);
+
+    // Test with single partition
+    std::fill(partitions.begin(), partitions.end(), 0);
+    auto vectorCopy = BaseVector::copy(*vector, pool_.get());
+    testPartitionedVector(vectorCopy, partitions, 1);
+
+    // Test with two partitions
+    if (vector->size() >= 3) {
+      for (uint32_t i = 0; i < partitions.size(); ++i) {
+        partitions[i] = i % 2;
+      }
+      vectorCopy = BaseVector::copy(*vector, pool_.get());
+      testPartitionedVector(vectorCopy, partitions, 2);
+    }
+
+    // Test with three partitions
+    for (uint32_t i = 0; i < partitions.size(); ++i) {
+      partitions[i] = i % 3;
+    }
+    vectorCopy = BaseVector::copy(*vector, pool_.get());
+    testPartitionedVector(vectorCopy, partitions, 3);
+
+    if (vector->size() > 4) {
+      // Test with four partitions where the first partition is empty
+      for (uint32_t i = 0; i < partitions.size(); ++i) {
+        partitions[i] = i % 3 + 1;
+      }
+      vectorCopy = BaseVector::copy(*vector, pool_.get());
+      testPartitionedVector(vectorCopy, partitions, 4);
+
+      // Test with four partitions where the last partition is empty
+      for (uint32_t i = 0; i < partitions.size(); ++i) {
+        partitions[i] = i % 3;
+      }
+      vectorCopy = BaseVector::copy(*vector, pool_.get());
+      testPartitionedVector(vectorCopy, partitions, 4);
+    }
+
+    // Test with one value per partition
+    if (vector->size() > 0) {
+      std::iota(partitions.begin(), partitions.end(), 0);
+      vectorCopy = BaseVector::copy(*vector, pool_.get());
+      testPartitionedVector(vectorCopy, partitions, numRows);
+    }
+
+    // Test with random partitions (number of partitions <= number of values).
+    // Skipped for empty input: uniform_int_distribution<>(0, -1) would violate
+    // the distribution's a <= b precondition.
+    if (numRows > 0) {
+      std::uniform_int_distribution<> dis(0, numRows - 1);
+      uint32_t maxPartition = 0;
+      for (uint32_t i = 0; i < numRows; ++i) {
+        partitions[i] = dis(gen_);
+        maxPartition = std::max(maxPartition, partitions[i]);
+      }
+      vectorCopy = BaseVector::copy(*vector, pool_.get());
+      testPartitionedVector(vectorCopy, partitions, maxPartition + 1);
+    }
+  }
+};
+
+TEST_P(PartitioningVectorTest, testFlatVector) {
+  // Number of values in the vector to be partitioned. This is passed as a test
+  // parameter and is used to test different vector sizes, including edge cases
+  // like 0 and 1.
+  const int numValues = GetParam();
+
+  // Sequential values (value == row index), no nulls
+  testVectorPartitioning(
+      makeFlatVector<int>(numValues, [](auto row) { return row; }));
+
+  // Sequential values, with nulls supplied by nullEvery(2, 1)
+  testVectorPartitioning(
+      makeFlatVector<int>(
+          numValues, [](auto row) { return row; }, nullEvery(2, 1)));
+
+  // All nulls
+  testVectorPartitioning(makeAllNullFlatVector<int>(numValues));
+}
+
+// Vector sizes under test; 0 and 1 cover the empty and single-row edge cases.
+INSTANTIATE_TEST_SUITE_P(
+    FlatVectorSizes,
+    PartitioningVectorTest,
+    ::testing::Values(0, 1, 10, 10000));
+
+} // namespace facebook::velox::test
diff --git a/velox/vector/tests/utils/CMakeLists.txt b/velox/vector/tests/utils/CMakeLists.txt
index 9e7fbae65b6..35a56901ccf 100644
--- a/velox/vector/tests/utils/CMakeLists.txt
+++ b/velox/vector/tests/utils/CMakeLists.txt
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-add_library(velox_vector_test_lib VectorMaker.cpp VectorTestBase.cpp)
+add_library(velox_vector_test_lib PartitionedVectorTestBase.cpp VectorMaker.cpp VectorTestBase.cpp)
 velox_add_test_headers(
   velox_vector_test_lib
   VectorMaker-inl.h
diff --git a/velox/vector/tests/utils/PartitionedVectorTestBase.cpp b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp
new file mode 100644
index 00000000000..6c939dfb569
--- /dev/null
+++ b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/vector/tests/utils/PartitionedVectorTestBase.h"
+
+namespace facebook::velox::test {
+
+VectorPtr PartitionedVectorTestBase::canonicalize(VectorPtr vector) {
+  const auto size = vector->size();
+
+  // Start from the identity mapping; the sort below reorders it in place.
+  auto sortedIndices = makeIndices(size, [](auto row) { return row; });
+  auto* begin = sortedIndices->asMutable<vector_size_t>();
+  auto* end = begin + size;
+
+  // Order row numbers by referenced value; ties keep their input order.
+  const auto* base = vector.get();
+  std::stable_sort(begin, end, [base](vector_size_t lhs, vector_size_t rhs) {
+    return base->compare(base, lhs, rhs) < 0;
+  });
+
+  // Expose the ordering as a dictionary over the unmodified input.
+  return wrapInDictionary(sortedIndices, size, vector);
+}
+
+std::vector<VectorPtr> PartitionedVectorTestBase::partitionVectorByWrapping(
+    VectorPtr vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions) {
+  // Builds one vector per partition id: a dictionary over a copy of 'vector'
+  // that selects the partition's rows, reordered by canonicalize().
+  const auto numRows = vector->size();
+  VELOX_CHECK_GE(partitions.size(), static_cast<size_t>(numRows));
+  // Count the number of rows in each partition, rejecting out-of-range ids.
+  std::vector<uint32_t> partitionRowCounts(numPartitions, 0);
+  for (vector_size_t i = 0; i < numRows; ++i) {
+    VELOX_CHECK_LT(partitions[i], numPartitions);
+    partitionRowCounts[partitions[i]]++;
+  }
+  std::vector<VectorPtr> partitionedVectors(numPartitions, nullptr);
+  for (uint32_t p = 0; p < numPartitions; ++p) {
+    const auto numRowsInPartition = partitionRowCounts[p];
+    if (numRowsInPartition == 0) {
+      partitionedVectors[p] =
+          BaseVector::create(vector->type(), 0, pool_.get());
+      continue;
+    }
+
+    // Create an indices buffer for each partition, and fill it with the row
+    // indices for that partition.
+    std::vector<vector_size_t> rowIdsInPartition(numRowsInPartition);
+    vector_size_t offset = 0;
+    for (vector_size_t i = 0; i < numRows; ++i) {
+      if (partitions[i] == p) {
+        VELOX_DCHECK_LT(offset, numRowsInPartition);
+        rowIdsInPartition[offset++] = i;
+      }
+    }
+    VELOX_CHECK_EQ(offset, numRowsInPartition);
+    auto indices = makeIndices(numRowsInPartition, [&](auto row) {
+      return rowIdsInPartition[row];
+    });
+
+    // Simulate partitioning by building the DictionaryVector with the
+    // partitioned indices.
+    // Copy first because wrapInDictionary would take the ownership of the
+    // vector.
+    VectorPtr vectorCopy = BaseVector::copy(*vector, pool_.get());
+    auto dictionaryVector = BaseVector::wrapInDictionary(
+        nullptr, indices, numRowsInPartition, vectorCopy);
+    partitionedVectors[p] = canonicalize(dictionaryVector);
+  }
+  return partitionedVectors;
+}
+
+std::vector<VectorPtr> PartitionedVectorTestBase::partitionRowVectors(
+    const std::vector<RowVectorPtr>& rowVectors,
+    int32_t numPartitions,
+    core::PartitionFunction* partitionFunction) {
+  // Convert element-wise; casting between unrelated vector types is undefined.
+  const std::vector<VectorPtr> inputs(rowVectors.begin(), rowVectors.end());
+  VectorPtr mergedRowVector = mergeVectors(inputs);
+
+  std::vector<uint32_t> partitions(mergedRowVector->size(), 0);
+  if (numPartitions > 1) {
+    // PartitionFunction::partition() returns an id, and leaves 'partitions'
+    // unfilled, when every row belongs to the same partition.
+    if (const auto same = partitionFunction->partition(
+            *mergedRowVector->as<RowVector>(), partitions)) {
+      partitions.assign(partitions.size(), *same);
+    }
+  }
+  auto partitionedVectors =
+      partitionVectorByWrapping(mergedRowVector, partitions, numPartitions);
+  for (auto& vector : partitionedVectors) {
+    vector = canonicalize(vector);
+  }
+  return partitionedVectors;
+}
+
+VectorPtr PartitionedVectorTestBase::mergeVectors(
+    const std::vector<VectorPtr>& vectors) {
+  // Start from a copy of the first input, then append the rest in order.
+  VELOX_CHECK(!vectors.empty(), "mergeVectors requires at least one vector");
+  auto mergedVector = BaseVector::copy(*vectors[0]);
+  for (size_t i = 1; i < vectors.size(); ++i) {
+    mergedVector->append(vectors[i].get());
+  }
+
+  return mergedVector;
+}
+
+} // namespace facebook::velox::test
diff --git a/velox/vector/tests/utils/PartitionedVectorTestBase.h b/velox/vector/tests/utils/PartitionedVectorTestBase.h
new file mode 100644
index 00000000000..b2c50761edc
--- /dev/null
+++ b/velox/vector/tests/utils/PartitionedVectorTestBase.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/core/PlanNode.h"
+#include "velox/vector/PartitionedVector.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+namespace facebook::velox::test {
+
+class PartitionedVectorTestBase : public VectorTestBase {
+ protected:
+  std::vector<VectorPtr> partitionVectorByWrapping(
+      VectorPtr vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions); // One output vector per partition id.
+  /// Merges 'rowVectors', computes row partitions, then splits per partition.
+  std::vector<VectorPtr> partitionRowVectors(
+      const std::vector<RowVectorPtr>& rowVectors,
+      int32_t numPartitions,
+      core::PartitionFunction* partitionFunction);
+  /// Returns 'vector' wrapped in a dictionary that orders its rows by value.
+  VectorPtr canonicalize(VectorPtr vector);
+  /// Copies the first vector and appends the remaining ones in input order.
+  VectorPtr mergeVectors(const std::vector<VectorPtr>& vectors);
+};
+
+} // namespace facebook::velox::test

From 3853bf648ce8f361a0b3245aa469c63e8d0f7f8f Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Thu, 5 Mar 2026 07:15:03 -0800
Subject: [PATCH 02/24] feat: Add PartitionedRowVector implementation

Signed-off-by: Xin Zhang <xin-zhang2@ibm.com>

Alchemy-item: (ID = 1167) Add PartitionedRowVector commit 1/1 - f2af427191ae48de9e2b65b4d6ef6e3525673435
---
 velox/vector/PartitionedVector.cpp           | 79 +++++++++++++++++++-
 velox/vector/PartitionedVector.h             | 27 +++++++
 velox/vector/tests/PartitionedVectorTest.cpp | 42 +++++++++++
 3 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
index 43fb7fb5d53..0999e59b351 100644
--- a/velox/vector/PartitionedVector.cpp
+++ b/velox/vector/PartitionedVector.cpp
@@ -211,6 +211,26 @@ PartitionedVectorPtr createPartitionedFlatVector(
   return partitionedFlatVector;
 }
 
+PartitionedVectorPtr createPartitionedRowVector(
+    VectorPtr vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(vector);
+  VELOX_CHECK_NOT_NULL(rowVector);
+
+  auto partitionedRowVector = std::make_shared<PartitionedRowVector>(
+      rowVector, numPartitions, endPartitionOffsets, pool);
+
+  // Always call partition() to initialize partitionedChildren_, even when
+  // numPartitions == 1, so that partitionAt() can reconstruct the RowVector.
+  partitionedRowVector->partition(partitions, ctx);
+
+  return partitionedRowVector;
+}
+
 } // namespace
 
 PartitionedVector::~PartitionedVector() = default;
@@ -272,7 +292,11 @@ PartitionedVectorPtr PartitionedVector::create(
       return partitionedFlatVector;
     }
 
-    case VectorEncoding::Simple::ROW:
+    case VectorEncoding::Simple::ROW: {
+      return createPartitionedRowVector(
+          vector, partitions, numPartitions, endPartitionOffsets, ctx, pool);
+    }
+
     case VectorEncoding::Simple::ARRAY:
     case VectorEncoding::Simple::MAP:
     case VectorEncoding::Simple::DICTIONARY:
@@ -340,4 +364,57 @@ VectorPtr PartitionedFlatVector<T>::partitionAt(uint32_t partition) const {
   return vector_->slice(beginOffset, numRowsInPartition);
 }
 
+void PartitionedRowVector::partition(
+    const std::vector<uint32_t>& partitions,
+    PartitionBuildContext& ctx) {
+  auto* rowVector = vector_->as<RowVector>();
+  partitionedChildren_.reserve(rowVector->childrenSize());
+
+  for (const auto& child : rowVector->children()) {
+    partitionedChildren_.push_back(PartitionedVector::create(
+        child, partitions, numPartitions_, endPartitionOffsets_, ctx, pool_));
+  }
+
+  if (numPartitions_ > 1) {
+    Byte* rawNulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
+    if (rawNulls) {
+      partitionBitsInPlace(
+          rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
+    }
+  }
+}
+
+VectorPtr PartitionedRowVector::partitionAt(uint32_t partition) const {
+  VELOX_CHECK_LT(partition, numPartitions_);
+  // A partition starts where the previous one ends; partition 0 starts at 0.
+  vector_size_t beginOffset =
+      partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1];
+  vector_size_t numRowsInPartition =
+      rawEndPartitionOffsets_[partition] - beginOffset;
+  // Ask every partitioned child for its own vector of the same partition.
+  std::vector<VectorPtr> children;
+  children.reserve(partitionedChildren_.size());
+  for (const auto& child : partitionedChildren_) {
+    children.push_back(child->partitionAt(partition));
+  }
+  // Copy the row-level null bits of this partition so they start at bit 0.
+  BufferPtr nulls = nullptr;
+  if (numRowsInPartition > 0 && vector_->rawNulls()) {
+    nulls = AlignedBuffer::allocate<bool>(numRowsInPartition, pool_);
+    bits::copyBits(
+        vector_->rawNulls(),
+        beginOffset,
+        nulls->asMutable<uint64_t>(),
+        0,
+        numRowsInPartition);
+  }
+  // Assemble the result from the per-partition children and null bits.
+  return std::make_shared<RowVector>(
+      pool_,
+      vector_->type(),
+      std::move(nulls),
+      numRowsInPartition,
+      std::move(children));
+}
+
 } // namespace facebook::velox
diff --git a/velox/vector/PartitionedVector.h b/velox/vector/PartitionedVector.h
index 8c0983813e9..2d7d67adda8 100644
--- a/velox/vector/PartitionedVector.h
+++ b/velox/vector/PartitionedVector.h
@@ -241,4 +241,31 @@ class PartitionedFlatVector : public PartitionedVector {
   }
 };
 
+/// Partitions a RowVector in-place so that rows belonging to the same
+/// partition occupy a contiguous range. Recursively partitions each child
+/// column using PartitionedVector.
+class PartitionedRowVector : public PartitionedVector {
+ public:
+  PartitionedRowVector(
+      const VectorPtr& rowVector,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : PartitionedVector(rowVector, numPartitions, partitionOffsets, pool) {}
+  /// Partitions every child column, then the row-level null bits if any.
+  void partition(
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) override;
+  /// Returns the rows of 'partition' as a RowVector of per-child partitions.
+  VectorPtr partitionAt(uint32_t partition) const override;
+  /// Unsupported for row vectors: always fails with VELOX_UNREACHABLE.
+  const vector_size_t* rawSizes() override {
+    VELOX_UNREACHABLE("PartitionedRowVector does not implement rawSizes()");
+  }
+
+ private:
+  /// Partitioned child columns, one per child of the underlying RowVector.
+  std::vector<PartitionedVectorPtr> partitionedChildren_;
+};
+
 } // namespace facebook::velox
diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp
index df5b586ec6a..19043a3145c 100644
--- a/velox/vector/tests/PartitionedVectorTest.cpp
+++ b/velox/vector/tests/PartitionedVectorTest.cpp
@@ -159,6 +159,48 @@ TEST_P(PartitioningVectorTest, testFlatVector) {
   testVectorPartitioning(makeAllNullFlatVector<int>(numValues));
 }
 
+TEST_P(PartitioningVectorTest, testRowVector) {
+  const int numValues = GetParam();
+
+  // Two flat columns, no nulls at any level.
+  testVectorPartitioning(makeRowVector({
+      makeFlatVector<int32_t>(numValues, [](auto row) { return row; }),
+      makeFlatVector<int64_t>(numValues, [](auto row) { return row * 10; }),
+  }));
+
+  // Two flat columns with nullable children.
+  testVectorPartitioning(makeRowVector({
+      makeFlatVector<int32_t>(
+          numValues, [](auto row) { return row; }, nullEvery(2)),
+      makeFlatVector<int64_t>(
+          numValues, [](auto row) { return row * 10; }, nullEvery(3)),
+  }));
+
+  // Row-level nulls with no child nulls.
+  testVectorPartitioning(makeRowVector(
+      {makeFlatVector<int32_t>(numValues, [](auto row) { return row; })},
+      nullEvery(2)));
+
+  // Row-level nulls combined with nullable children.
+  testVectorPartitioning(makeRowVector(
+      {makeFlatVector<int32_t>(
+          numValues, [](auto row) { return row; }, nullEvery(3))},
+      nullEvery(2)));
+
+  // All rows null.
+  testVectorPartitioning(makeRowVector(
+      {makeFlatVector<int32_t>(numValues, [](auto row) { return row; })},
+      [](auto /*row*/) { return true; }));
+
+  // Nested RowVector.
+  testVectorPartitioning(makeRowVector({
+      makeFlatVector<int32_t>(numValues, [](auto row) { return row; }),
+      makeRowVector({
+          makeFlatVector<int64_t>(numValues, [](auto row) { return row; }),
+      }),
+  }));
+}
+
 // Test with different vector sizes, including edge cases like 0 and 1.
 INSTANTIATE_TEST_SUITE_P(
     FlatVectorSizes,

From ff2e34b3b35311e72377ac4446cea592a86f44af Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Tue, 10 Mar 2026 12:09:19 +0000
Subject: [PATCH 03/24] refactor: Move initializeCursorPartitionOffsets into
 partitionFixedWidthValuesInPlace

---
 velox/vector/PartitionedVector.cpp | 31 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
index 0999e59b351..59a9131dfe1 100644
--- a/velox/vector/PartitionedVector.cpp
+++ b/velox/vector/PartitionedVector.cpp
@@ -87,26 +87,31 @@ void partitionFixedWidthValuesInPlace(
     T* values,
     const std::vector<uint32_t>& partitions,
     uint32_t numPartitions,
-    vector_size_t* cursorPartitionOffsets,
-    const vector_size_t* endPartitionOffsets) {
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
   VELOX_DCHECK_NOT_NULL(values);
-  VELOX_DCHECK_NOT_NULL(cursorPartitionOffsets);
   VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+  initializeCursorPartitionOffsets(
+      ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool);
+  auto* rawCursorOffsets =
+      ctx.cursorPartitionOffsets->asMutable<vector_size_t>();
+  const auto* rawEndOffsets = endPartitionOffsets->as<vector_size_t>();
 
   for (auto currentPartition = 0; currentPartition < numPartitions;
        currentPartition++) {
-    vector_size_t& offset = cursorPartitionOffsets[currentPartition];
-    vector_size_t endOffset = endPartitionOffsets[currentPartition];
+    auto& offset = rawCursorOffsets[currentPartition];
+    auto endOffset = rawEndOffsets[currentPartition];
 
     while (offset < endOffset) {
       uint32_t targetPartition = partitions[offset];
 
       while (targetPartition != currentPartition) {
-        auto destinationOffset = cursorPartitionOffsets[targetPartition]++;
+        auto destinationOffset = rawCursorOffsets[targetPartition]++;
         std::swap(values[destinationOffset], values[offset]);
         targetPartition = partitions[destinationOffset];
       }
-      offset = ++cursorPartitionOffsets[currentPartition];
+      offset = ++rawCursorOffsets[currentPartition];
     }
   }
 }
@@ -120,20 +125,10 @@ void partitionFixedWidthValues(
     PartitionBuildContext& ctx,
     velox::memory::MemoryPool* pool) {
   VELOX_DCHECK_NOT_NULL(inputBuffer);
-  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
 
   auto input = inputBuffer->asMutable<T>();
-
-  initializeCursorPartitionOffsets(
-      ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool);
-
-  vector_size_t* rawCursorOffsets =
-      ctx.cursorPartitionOffsets->asMutable<vector_size_t>();
-  const vector_size_t* rawEndOffsets =
-      endPartitionOffsets->asMutable<vector_size_t>();
-
   partitionFixedWidthValuesInPlace<T>(
-      input, partitions, numPartitions, rawCursorOffsets, rawEndOffsets);
+      input, partitions, numPartitions, endPartitionOffsets, ctx, pool);
 }
 
 // Swap two bits between two bytes

From 875c92c715df8a5a617430690471a662e91597ef Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Tue, 10 Mar 2026 12:01:55 +0000
Subject: [PATCH 04/24] fix: Add bool specialization for
 partitionFixedWidthValues

---
 velox/vector/PartitionedVector.cpp           | 66 ++++++++++++++------
 velox/vector/tests/PartitionedVectorTest.cpp | 16 +++++
 2 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
index 59a9131dfe1..e5806620feb 100644
--- a/velox/vector/PartitionedVector.cpp
+++ b/velox/vector/PartitionedVector.cpp
@@ -116,21 +116,6 @@ void partitionFixedWidthValuesInPlace(
   }
 }
 
-template <typename T>
-void partitionFixedWidthValues(
-    BufferPtr& inputBuffer,
-    const std::vector<uint32_t>& partitions,
-    const BufferPtr& endPartitionOffsets,
-    uint32_t numPartitions,
-    PartitionBuildContext& ctx,
-    velox::memory::MemoryPool* pool) {
-  VELOX_DCHECK_NOT_NULL(inputBuffer);
-
-  auto input = inputBuffer->asMutable<T>();
-  partitionFixedWidthValuesInPlace<T>(
-      input, partitions, numPartitions, endPartitionOffsets, ctx, pool);
-}
-
 // Swap two bits between two bytes
 void swapBit(Byte& byte1, BitIndex bit1, Byte& byte2, BitIndex bit2) {
   // Calculate the difference between the bits
@@ -151,9 +136,9 @@ void partitionBitsInPlace(
   initializeCursorPartitionOffsets(
       ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool);
 
-  auto rawCursorOffsets =
+  auto* rawCursorOffsets =
       ctx.cursorPartitionOffsets->asMutable<vector_size_t>();
-  auto rawEndOffsets = endPartitionOffsets->asMutable<vector_size_t>();
+  const auto* rawEndOffsets = endPartitionOffsets->as<vector_size_t>();
 
   for (uint32_t partition = 0; partition < numPartitions; partition++) {
     auto& offset = rawCursorOffsets[partition];
@@ -184,6 +169,36 @@ void partitionBitsInPlace(
   }
 }
 
+template <typename T>
+void partitionFixedWidthValues(
+    BufferPtr& inputBuffer,
+    const std::vector<uint32_t>& partitions,
+    const BufferPtr& endPartitionOffsets,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(inputBuffer);
+
+  auto input = inputBuffer->asMutable<T>();
+  partitionFixedWidthValuesInPlace<T>(
+      input, partitions, numPartitions, endPartitionOffsets, ctx, pool);
+}
+
+template <>
+void partitionFixedWidthValues<bool>(
+    BufferPtr& inputBuffer,
+    const std::vector<uint32_t>& partitions,
+    const BufferPtr& endPartitionOffsets,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(inputBuffer);
+
+  auto input = inputBuffer->asMutable<Byte>();
+  partitionBitsInPlace(
+      input, partitions, numPartitions, ctx, endPartitionOffsets, pool);
+}
+
 template <TypeKind typeKind>
 PartitionedVectorPtr createPartitionedFlatVector(
     VectorPtr vector,
@@ -366,15 +381,26 @@ void PartitionedRowVector::partition(
   partitionedChildren_.reserve(rowVector->childrenSize());
 
   for (const auto& child : rowVector->children()) {
-    partitionedChildren_.push_back(PartitionedVector::create(
-        child, partitions, numPartitions_, endPartitionOffsets_, ctx, pool_));
+    partitionedChildren_.push_back(
+        PartitionedVector::create(
+            child,
+            partitions,
+            numPartitions_,
+            endPartitionOffsets_,
+            ctx,
+            pool_));
   }
 
   if (numPartitions_ > 1) {
     Byte* rawNulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
     if (rawNulls) {
       partitionBitsInPlace(
-          rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
+          rawNulls,
+          partitions,
+          numPartitions_,
+          ctx,
+          endPartitionOffsets_,
+          pool_);
     }
   }
 }
diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp
index 19043a3145c..f35f42e0218 100644
--- a/velox/vector/tests/PartitionedVectorTest.cpp
+++ b/velox/vector/tests/PartitionedVectorTest.cpp
@@ -159,6 +159,22 @@ TEST_P(PartitioningVectorTest, testFlatVector) {
   testVectorPartitioning(makeAllNullFlatVector<int>(numValues));
 }
 
+TEST_P(PartitioningVectorTest, testFlatBoolVector) {
+  const int numValues = GetParam();
+
+  // Alternating true/false values, no nulls
+  testVectorPartitioning(
+      makeFlatVector<bool>(numValues, [](auto row) { return row % 2 == 0; }));
+
+  // Alternating true/false values, with half number of nulls
+  testVectorPartitioning(
+      makeFlatVector<bool>(
+          numValues, [](auto row) { return row % 2 == 0; }, nullEvery(2, 1)));
+
+  // All nulls
+  testVectorPartitioning(makeAllNullFlatVector<bool>(numValues));
+}
+
 TEST_P(PartitioningVectorTest, testRowVector) {
   const int numValues = GetParam();
 

From 281a365ff3bdd025602e1d40614a1e7c431d625a Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Thu, 12 Mar 2026 06:19:10 -0700
Subject: [PATCH 05/24] fix: Avoid allocating null buffer when partitioning
 null-free vectors

PartitionedFlatVector::partition() and PartitionedRowVector::partition()
called mutableRawNulls() unconditionally. mutableRawNulls() allocates a
null buffer if one does not exist, causing mayHaveNulls() to return true
for every vector after partitioning, even when the original had no nulls.

Fix both sites to check rawNulls() first and only call mutableRawNulls()
when a null buffer already exists.

Add noNullBufferAllocatedForNullFreeFlat and
noNullBufferAllocatedForNullFreeRow tests to PartitionedVectorTest to
cover this case.

# Conflicts:
#	velox/vector/PartitionedVector.cpp
---
 velox/vector/PartitionedVector.cpp           | 12 ++---
 velox/vector/tests/PartitionedVectorTest.cpp | 51 ++++++++++++++++++++
 2 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
index e5806620feb..7713f8ecfd2 100644
--- a/velox/vector/PartitionedVector.cpp
+++ b/velox/vector/PartitionedVector.cpp
@@ -346,8 +346,8 @@ template <typename T>
 void PartitionedFlatVector<T>::partition(
     const std::vector<uint32_t>& partitions,
     PartitionBuildContext& ctx) {
-  Byte* rawNulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
-  if (rawNulls) {
+  if (vector_->rawNulls()) {
+    Byte* rawNulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
     partitionBitsInPlace(
         rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
   }
@@ -391,17 +391,15 @@ void PartitionedRowVector::partition(
             pool_));
   }
 
-  if (numPartitions_ > 1) {
+  if (numPartitions_ > 1 && vector_->rawNulls()) {
     Byte* rawNulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
-    if (rawNulls) {
-      partitionBitsInPlace(
-          rawNulls,
+    partitionBitsInPlace(
+        rawNulls,
           partitions,
           numPartitions_,
           ctx,
           endPartitionOffsets_,
           pool_);
-    }
   }
 }
 
diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp
index f35f42e0218..f87c9514d4e 100644
--- a/velox/vector/tests/PartitionedVectorTest.cpp
+++ b/velox/vector/tests/PartitionedVectorTest.cpp
@@ -217,6 +217,57 @@ TEST_P(PartitioningVectorTest, testRowVector) {
   }));
 }
 
+// Partitioning a null-free vector must not allocate a null buffer.
+TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeFlat) {
+  const int numValues = GetParam();
+  if (numValues == 0) {
+    return;
+  }
+
+  auto flat = makeFlatVector<int32_t>(numValues, [](auto row) { return row; });
+  ASSERT_FALSE(flat->mayHaveNulls());
+
+  std::vector<uint32_t> partitions(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    partitions[i] = i % 2;
+  }
+
+  auto pv = PartitionedVector::create(flat, partitions, 2, ctx_, pool_.get());
+  EXPECT_FALSE(pv->baseVector()->mayHaveNulls())
+      << "partition() must not allocate a null buffer for a null-free FlatVector";
+}
+
+// Partitioning a null-free RowVector must not allocate null buffers on the
+// row vector or any of its children.
+TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeRow) {
+  const int numValues = GetParam();
+  if (numValues == 0) {
+    return;
+  }
+
+  auto row = makeRowVector({
+      makeFlatVector<int32_t>(numValues, [](auto row) { return row; }),
+      makeFlatVector<int64_t>(numValues, [](auto row) { return row * 10; }),
+  });
+  ASSERT_FALSE(row->mayHaveNulls());
+  ASSERT_FALSE(row->childAt(0)->mayHaveNulls());
+  ASSERT_FALSE(row->childAt(1)->mayHaveNulls());
+
+  std::vector<uint32_t> partitions(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    partitions[i] = i % 2;
+  }
+
+  auto pv = PartitionedVector::create(row, partitions, 2, ctx_, pool_.get());
+  auto* base = pv->baseVector()->as<RowVector>();
+  EXPECT_FALSE(base->mayHaveNulls())
+      << "partition() must not allocate a null buffer for a null-free RowVector";
+  EXPECT_FALSE(base->childAt(0)->mayHaveNulls())
+      << "partition() must not allocate a null buffer for null-free child 0";
+  EXPECT_FALSE(base->childAt(1)->mayHaveNulls())
+      << "partition() must not allocate a null buffer for null-free child 1";
+}
+
 // Test with different vector sizes, including edge cases like 0 and 1.
 INSTANTIATE_TEST_SUITE_P(
     FlatVectorSizes,

From 6519a8f1dbc2c19e332642333db0999eacd1ffe0 Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Fri, 13 Mar 2026 11:18:57 +0000
Subject: [PATCH 06/24] feat: Add PartitionedConstantVector implementation

---
 velox/vector/PartitionedVector.cpp           | 28 +++++++++++++++-----
 velox/vector/PartitionedVector.h             | 27 +++++++++++++++++++
 velox/vector/tests/PartitionedVectorTest.cpp | 11 ++++++++
 3 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
index 7713f8ecfd2..1bee4029a66 100644
--- a/velox/vector/PartitionedVector.cpp
+++ b/velox/vector/PartitionedVector.cpp
@@ -307,12 +307,16 @@ PartitionedVectorPtr PartitionedVector::create(
           vector, partitions, numPartitions, endPartitionOffsets, ctx, pool);
     }
 
+    case VectorEncoding::Simple::CONSTANT: {
+      return std::make_shared<PartitionedConstantVector>(
+          vector, numPartitions, endPartitionOffsets, pool);
+    }
+
     case VectorEncoding::Simple::ARRAY:
     case VectorEncoding::Simple::MAP:
     case VectorEncoding::Simple::DICTIONARY:
     case VectorEncoding::Simple::BIASED:
     case VectorEncoding::Simple::SEQUENCE:
-    case VectorEncoding::Simple::CONSTANT:
     case VectorEncoding::Simple::LAZY:
       VELOX_UNSUPPORTED(
           "Unsupported vector encoding for PartitionedVector: {}",
@@ -394,12 +398,7 @@ void PartitionedRowVector::partition(
   if (numPartitions_ > 1 && vector_->rawNulls()) {
     Byte* rawNulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
     partitionBitsInPlace(
-        rawNulls,
-          partitions,
-          numPartitions_,
-          ctx,
-          endPartitionOffsets_,
-          pool_);
+        rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
   }
 }
 
@@ -436,4 +435,19 @@ VectorPtr PartitionedRowVector::partitionAt(uint32_t partition) const {
       std::move(children));
 }
 
+void PartitionedConstantVector::partition(
+    const std::vector<uint32_t>& /*partitions*/,
+    PartitionBuildContext& /*ctx*/) {}
+
+VectorPtr PartitionedConstantVector::partitionAt(uint32_t partition) const {
+  VELOX_CHECK_LT(partition, numPartitions_);
+
+  const vector_size_t beginOffset =
+      partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1];
+  const vector_size_t numRowsInPartition =
+      rawEndPartitionOffsets_[partition] - beginOffset;
+  // Only the row count matters here, so the slice always starts at row 0.
+  return vector_->slice(0, numRowsInPartition);
+}
+
 } // namespace facebook::velox
diff --git a/velox/vector/PartitionedVector.h b/velox/vector/PartitionedVector.h
index 2d7d67adda8..c1c417e92a6 100644
--- a/velox/vector/PartitionedVector.h
+++ b/velox/vector/PartitionedVector.h
@@ -268,4 +268,31 @@ class PartitionedRowVector : public PartitionedVector {
   std::vector<PartitionedVectorPtr> partitionedChildren_;
 };
 
+/// Partitions a ConstantVector by reusing the same constant payload and
+/// returning constant slices sized to each partition.
+class PartitionedConstantVector : public PartitionedVector {
+ public:
+  PartitionedConstantVector(
+      const VectorPtr& constantVector,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : PartitionedVector(
+            constantVector,
+            numPartitions,
+            partitionOffsets,
+            pool) {}
+  /// Does nothing: the constant payload is reused as is.
+  void partition(
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) override;
+  /// Returns a slice of the constant sized to the rows of 'partition'.
+  VectorPtr partitionAt(uint32_t partition) const override;
+  /// Unsupported for constant vectors: always fails with VELOX_UNREACHABLE.
+  const vector_size_t* rawSizes() override {
+    VELOX_UNREACHABLE(
+        "PartitionedConstantVector does not implement rawSizes()");
+  }
+};
+
 } // namespace facebook::velox
diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp
index f87c9514d4e..4a16f5130ba 100644
--- a/velox/vector/tests/PartitionedVectorTest.cpp
+++ b/velox/vector/tests/PartitionedVectorTest.cpp
@@ -217,6 +217,17 @@ TEST_P(PartitioningVectorTest, testRowVector) {
   }));
 }
 
+TEST_P(PartitioningVectorTest, testConstantVector) {
+  const int numValues = GetParam();
+
+  testVectorPartitioning(makeConstant<int32_t>(7, numValues));
+  testVectorPartitioning(makeConstant<int32_t>(std::nullopt, numValues));
+  testVectorPartitioning(makeConstantRow(
+      ROW({"c0", "c1"}, {INTEGER(), VARCHAR()}),
+      variant::row({variant(11), variant("constant")}),
+      numValues));
+}
+
 // Partitioning a null-free vector must not allocate a null buffer.
 TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeFlat) {
   const int numValues = GetParam();

From d8f34b40b751bb54307193475380ca52e3611ec9 Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Wed, 4 Mar 2026 10:19:15 +0000
Subject: [PATCH 07/24] Add PartitionedVector benchmark

---
 velox/vector/benchmarks/CMakeLists.txt        |  10 +
 .../benchmarks/PartitionedVectorBenchmark.cpp | 184 ++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 velox/vector/benchmarks/PartitionedVectorBenchmark.cpp

diff --git a/velox/vector/benchmarks/CMakeLists.txt b/velox/vector/benchmarks/CMakeLists.txt
index 0cb3c78bfd8..8c1840daa1b 100644
--- a/velox/vector/benchmarks/CMakeLists.txt
+++ b/velox/vector/benchmarks/CMakeLists.txt
@@ -45,3 +45,13 @@ target_link_libraries(
   gflags::gflags
   glog::glog
 )
+
+add_executable(velox_vector_partitioned_vector_benchmark PartitionedVectorBenchmark.cpp)
+target_link_libraries(
+  velox_vector_partitioned_vector_benchmark
+  velox_dwio_common_test_utils
+  velox_vector
+  velox_vector_test_lib
+  Folly::folly
+  Folly::follybenchmark
+)
diff --git a/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp
new file mode 100644
index 00000000000..681a2e0c188
--- /dev/null
+++ b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <absl/random/uniform_int_distribution.h>
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include <algorithm>
+
+#include "dwio/common/tests/utils/BatchMaker.h"
+#include "vector/PartitionedVector.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::test;
+
+namespace facebook::velox::test {
+
+namespace {
+
+thread_local auto gen = std::mt19937(42); // Fixed seed (42).
+// Null predicates passed as 'isNullAt'; this one marks no row as null.
+auto noNulls = [](vector_size_t) { return false; };
+// Marks every row as null.
+auto allNulls = [](vector_size_t) { return true; };
+// Marks even-numbered rows as null.
+auto halfNulls = [](vector_size_t row) { return row % 2 == 0; };
+
+template <TypeKind T> // Every column has the scalar type of kind T.
+RowTypePtr scalarTypeGenerator(int32_t numColumns) {
+  return ROW(std::vector<TypePtr>(numColumns, createScalarType<T>()));
+}
+// ROW of 'numColumns' DATE columns.
+RowTypePtr dateTypeGenerator(int32_t numColumns) {
+  return ROW(std::vector<TypePtr>(numColumns, DATE()));
+}
+// ROW of 'numColumns' DECIMAL(10, 2) columns.
+RowTypePtr shortDecimalTypeGenerator(int32_t numColumns) {
+  return ROW(std::vector<TypePtr>(numColumns, DECIMAL(10, 2)));
+}
+// ROW of 'numColumns' DECIMAL(20, 3) columns.
+RowTypePtr longDecimalTypeGenerator(int32_t numColumns) {
+  return ROW(std::vector<TypePtr>(numColumns, DECIMAL(20, 3)));
+}
+
+// Builds a ROW of 'numColumns' children by cycling through a fixed list of
+// types and then shuffling the column order with 'gen'.
+RowTypePtr mixedFlatTypeGenerator(int32_t numColumns) {
+  const std::vector<TypePtr> candidates = {
+      BOOLEAN(),
+      TINYINT(),
+      SMALLINT(),
+      INTEGER(),
+      BIGINT(),
+      HUGEINT(),
+      REAL(),
+      DOUBLE(),
+      TIMESTAMP(),
+      DATE(),
+      DECIMAL(10, 2),
+      DECIMAL(20, 3),
+  };
+
+  std::vector<TypePtr> columnTypes;
+  columnTypes.reserve(numColumns);
+  for (int32_t col = 0; col < numColumns; ++col) {
+    // Column 'col' takes candidate 'col' modulo the list length.
+    columnTypes.push_back(candidates[col % candidates.size()]);
+  }
+  std::ranges::shuffle(columnTypes, gen);
+  return ROW(std::move(columnTypes));
+}
+
+auto randomPartitionFunction = [](const RowVectorPtr& vector,
+                                  uint32_t numPartitions,
+                                  std::vector<uint32_t>& partitions) {
+  // One id per input row, each drawn from 'gen' modulo 'numPartitions'.
+  partitions.resize(vector->size());
+  std::ranges::generate(
+      partitions, [numPartitions] { return gen() % numPartitions; });
+};
+
+std::shared_ptr<memory::MemoryPool> pool; // Assigned in main().
+std::vector<uint32_t> partitions; // Per-row partition ids, refilled by runBM().
+
+RowVectorPtr createTestVector(
+    const std::function<RowTypePtr(int32_t)>& rowTypeGenerator,
+    vector_size_t numRows,
+    int32_t numColumns,
+    const std::function<bool(vector_size_t)>& isNullAt) {
+  // Ask BatchMaker for 'numRows' rows of the generated row type.
+  return std::static_pointer_cast<RowVector>(BatchMaker::createBatch(
+      rowTypeGenerator(numColumns), numRows, *pool, isNullAt));
+}
+
+} // namespace
+
+void runBM(
+    uint32_t iterations,
+    const std::function<RowTypePtr(int32_t)>& rowTypeGenerator,
+    int32_t numColumns,
+    uint32_t numPartitions,
+    const std::function<bool(vector_size_t)>& isNullAt = noNulls,
+    vector_size_t numRows = 10000) {
+  folly::BenchmarkSuspender suspender; // Setup below is not timed.
+  PartitionBuildContext ctx; // Shared by every iteration.
+  auto vector =
+      createTestVector(rowTypeGenerator, numRows, numColumns, isNullAt);
+  randomPartitionFunction(vector, numPartitions, partitions);
+  for (uint32_t i = 0; i < iterations; ++i) {
+    // PartitionedVector::create mutates its input, so each iteration needs a
+    // fresh copy to keep inputs consistent.
+    const auto vectorCopy = std::static_pointer_cast<RowVector>(
+        BaseVector::copy(*vector, pool.get()));
+    suspender.dismiss(); // Resume timing for the create() call only.
+    PartitionedVector::create(
+        vectorCopy, partitions, numPartitions, ctx, pool.get());
+    suspender.rehire(); // Suspend timing again before the next copy.
+  }
+}
+
+#define BENCHMARK_CONFIG(name, generator, numCols, nulls, numParts) \
+  BENCHMARK_NAMED_PARAM(                                            \
+      runBM,                                                        \
+      name##_##numCols##Cols_##nulls##_P##numParts,                 \
+      generator,                                                    \
+      numCols,                                                      \
+      numParts,                                                     \
+      nulls);
+// Registers one configuration for 4, 16, 64, 256 and 1024 partitions.
+#define BENCHMARK_PARTITIONS(name, generator, numCols, nulls) \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 4)        \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 16)       \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 64)       \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 256)      \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 1024)
+// Registers the partition sweep for 1, 10, 100 and 1000 columns.
+#define BENCHMARK_SIZES(name, generator, nulls)     \
+  BENCHMARK_PARTITIONS(name, generator, 1, nulls)   \
+  BENCHMARK_PARTITIONS(name, generator, 10, nulls)  \
+  BENCHMARK_PARTITIONS(name, generator, 100, nulls) \
+  BENCHMARK_PARTITIONS(name, generator, 1000, nulls)
+// Registers the column sweep for the noNulls, allNulls and halfNulls patterns.
+#define BENCHMARK_TYPE(name, generator)      \
+  BENCHMARK_SIZES(name, generator, noNulls)  \
+  BENCHMARK_SIZES(name, generator, allNulls) \
+  BENCHMARK_SIZES(name, generator, halfNulls)
+
+BENCHMARK_TYPE(BOOLEAN, scalarTypeGenerator<TypeKind::BOOLEAN>);
+BENCHMARK_TYPE(SMALLINT, scalarTypeGenerator<TypeKind::SMALLINT>);
+BENCHMARK_TYPE(INTEGER, scalarTypeGenerator<TypeKind::INTEGER>);
+BENCHMARK_TYPE(BIGINT, scalarTypeGenerator<TypeKind::BIGINT>);
+BENCHMARK_TYPE(HUGEINT, scalarTypeGenerator<TypeKind::HUGEINT>);
+BENCHMARK_TYPE(REAL, scalarTypeGenerator<TypeKind::REAL>);
+BENCHMARK_TYPE(DOUBLE, scalarTypeGenerator<TypeKind::DOUBLE>);
+BENCHMARK_TYPE(TIMESTAMP, scalarTypeGenerator<TypeKind::TIMESTAMP>);
+BENCHMARK_TYPE(VARCHAR, scalarTypeGenerator<TypeKind::VARCHAR>);
+BENCHMARK_TYPE(VARBINARY, scalarTypeGenerator<TypeKind::VARBINARY>);
+BENCHMARK_TYPE(DATE, dateTypeGenerator);
+BENCHMARK_TYPE(ShortDecimal, shortDecimalTypeGenerator);
+BENCHMARK_TYPE(LongDecimal, longDecimalTypeGenerator);
+BENCHMARK_TYPE(Mixed, mixedFlatTypeGenerator);
+
+} // namespace facebook::velox::test
+
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv};
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  pool = memory::memoryManager()->addLeafPool(); // Pool used by runBM().
+  folly::runBenchmarks();
+  return 0;
+}

From 9eafc9d8904079ea44c4401a90bb7912a8be1bf4 Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Fri, 20 Mar 2026 05:32:26 -0700
Subject: [PATCH 08/24] feat(PartitionedOutput): Add numNullsPerPartition_ to
 PartitionedVector

---
 velox/vector/PartitionedVector.cpp           |  30 ++++-
 velox/vector/PartitionedVector.h             |  16 +++
 velox/vector/tests/PartitionedVectorTest.cpp | 130 ++++++++++++++++++-
 3 files changed, 172 insertions(+), 4 deletions(-)

diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
index 1bee4029a66..bc83840aa9c 100644
--- a/velox/vector/PartitionedVector.cpp
+++ b/velox/vector/PartitionedVector.cpp
@@ -214,9 +214,9 @@ PartitionedVectorPtr createPartitionedFlatVector(
   auto partitionedFlatVector = std::make_shared<PartitionedFlatVector<T>>(
       flatVector, numPartitions, endPartitionOffsets, pool);
 
-  if (numPartitions > 1) {
-    partitionedFlatVector->partition(partitions, ctx);
-  }
+  // Always call partition() so that numNullsPerPartition_ is populated,
+  // even when numPartitions == 1 and no data movement is required.
+  partitionedFlatVector->partition(partitions, ctx);
 
   return partitionedFlatVector;
 }
@@ -364,6 +364,18 @@ void PartitionedFlatVector<T>::partition(
       numPartitions_,
       ctx,
       pool_);
+
+  // Count nulls per partition from the now-partitioned null bitmap.
+  if (const uint64_t* rawNulls = vector_->rawNulls()) {
+    for (uint32_t p = 0; p < numPartitions_; ++p) {
+      const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1];
+      const vector_size_t end = rawEndPartitionOffsets_[p];
+      if (begin < end) {
+        numNullsPerPartition_[p] =
+            static_cast<vector_size_t>(bits::countNulls(rawNulls, begin, end));
+      }
+    }
+  }
 }
 
 template <typename T>
@@ -400,6 +412,18 @@ void PartitionedRowVector::partition(
     partitionBitsInPlace(
         rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
   }
+
+  // Count nulls per partition from the now-partitioned null bitmap.
+  if (const uint64_t* rawNulls = vector_->rawNulls()) {
+    for (uint32_t p = 0; p < numPartitions_; ++p) {
+      const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1];
+      const vector_size_t end = rawEndPartitionOffsets_[p];
+      if (begin < end) {
+        numNullsPerPartition_[p] =
+            static_cast<vector_size_t>(bits::countNulls(rawNulls, begin, end));
+      }
+    }
+  }
 }
 
 VectorPtr PartitionedRowVector::partitionAt(uint32_t partition) const {
diff --git a/velox/vector/PartitionedVector.h b/velox/vector/PartitionedVector.h
index c1c417e92a6..eb008f1193b 100644
--- a/velox/vector/PartitionedVector.h
+++ b/velox/vector/PartitionedVector.h
@@ -149,6 +149,12 @@ class PartitionedVector {
     return dynamic_cast<T*>(this);
   }
 
+  /// Returns the null-row count recorded for 'partition' during partition().
+  vector_size_t numNullsAt(uint32_t partition) const {
+    VELOX_DCHECK_LT(partition, numPartitions_); // Debug-only bounds check.
+    return numNullsPerPartition_[partition];
+  }
+
   TypeKind typeKind() const {
     return vector_->typeKind();
   }
@@ -181,6 +187,7 @@ class PartitionedVector {
       : vector_(vector),
         numPartitions_(numPartitions),
         endPartitionOffsets_(endPartitionOffsets),
+        numNullsPerPartition_(numPartitions, 0),
         pool_(pool) {
     VELOX_CHECK_NOT_NULL(vector_);
     VELOX_CHECK_GT(numPartitions_, 0);
@@ -215,6 +222,9 @@ class PartitionedVector {
   // partitioning.
   vector_size_t* rawEndPartitionOffsets_;
 
+  /// Null row counts per partition, computed during partition().
+  std::vector<vector_size_t> numNullsPerPartition_;
+
   velox::memory::MemoryPool* pool_;
 };
 
@@ -259,6 +269,12 @@ class PartitionedRowVector : public PartitionedVector {
 
   VectorPtr partitionAt(uint32_t partition) const override;
 
+  /// Returns the partitioned child vector at column index 'col'.
+  PartitionedVectorPtr childAt(uint32_t col) const {
+    VELOX_DCHECK_LT(col, partitionedChildren_.size()); // Debug-only check.
+    return partitionedChildren_[col];
+  }
+
   const vector_size_t* rawSizes() override {
     VELOX_UNREACHABLE("PartitionedRowVector does not implement rawSizes()");
   }
diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp
index 4a16f5130ba..569a6e6ae9f 100644
--- a/velox/vector/tests/PartitionedVectorTest.cpp
+++ b/velox/vector/tests/PartitionedVectorTest.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 #include <algorithm>
-#include <iostream>
 #include <random>
 
 #include <gtest/gtest.h>
@@ -279,6 +278,135 @@ TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeRow) {
       << "partition() must not allocate a null buffer for null-free child 1";
 }
 
+// numNullsAt() tests
+// ---------------------------------------------------------------------------
+
+// A null-free flat vector must report zero nulls for every partition.
+TEST_P(PartitioningVectorTest, numNullsAtFlatNoNulls) {
+  const int numValues = GetParam();
+  auto flat = makeFlatVector<int32_t>(numValues, [](auto row) { return row; });
+  // Assign rows to three partitions round-robin (row i -> partition i % 3).
+  std::vector<uint32_t> partitions(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    partitions[i] = i % 3;
+  }
+  auto pv = PartitionedVector::create(flat, partitions, 3, ctx_, pool_.get());
+  for (uint32_t p = 0; p < 3; ++p) {
+    EXPECT_EQ(pv->numNullsAt(p), 0) << "partition " << p;
+  }
+}
+
+// A flat vector with every other row null must report the exact per-partition
+// null count. The sum across all partitions must equal the total null count.
+TEST_P(PartitioningVectorTest, numNullsAtFlatSomeNulls) {
+  const int numValues = GetParam();
+  auto flat = makeFlatVector<int32_t>(
+      numValues, [](auto row) { return row; }, nullEvery(2));
+  // Assign rows to three partitions round-robin (row i -> partition i % 3).
+  std::vector<uint32_t> partitions(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    partitions[i] = i % 3;
+  }
+  auto pv = PartitionedVector::create(flat, partitions, 3, ctx_, pool_.get());
+
+  // Per-partition counts must agree with manual bit-scan of the base vector.
+  const auto* rawNulls = pv->baseVector()->rawNulls();
+  const auto* rawOffsets = pv->rawPartitionOffsets();
+  for (uint32_t p = 0; p < 3; ++p) {
+    const vector_size_t begin = p == 0 ? 0 : rawOffsets[p - 1];
+    const vector_size_t end = rawOffsets[p];
+    const vector_size_t expected = rawNulls
+        ? BaseVector::countNulls(pv->baseVector()->nulls(), begin, end)
+        : 0;
+    EXPECT_EQ(pv->numNullsAt(p), expected) << "partition " << p;
+  }
+
+  // Sum across partitions must equal the total null count in the source vector.
+  const vector_size_t total =
+      pv->numNullsAt(0) + pv->numNullsAt(1) + pv->numNullsAt(2);
+  EXPECT_EQ(total, BaseVector::countNulls(flat->nulls(), 0, numValues));
+}
+
+// An all-null flat vector must report numNullsAt(p) == rows in that partition.
+TEST_P(PartitioningVectorTest, numNullsAtFlatAllNulls) {
+  const int numValues = GetParam();
+  auto flat = makeAllNullFlatVector<int32_t>(numValues);
+  // Assign rows to three partitions round-robin (row i -> partition i % 3).
+  std::vector<uint32_t> partitions(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    partitions[i] = i % 3;
+  }
+  auto pv = PartitionedVector::create(flat, partitions, 3, ctx_, pool_.get());
+  // A partition's row count is the gap between adjacent end offsets.
+  const auto* rawOffsets = pv->rawPartitionOffsets();
+  for (uint32_t p = 0; p < 3; ++p) {
+    const vector_size_t begin = p == 0 ? 0 : rawOffsets[p - 1];
+    const vector_size_t numRowsInPartition = rawOffsets[p] - begin;
+    EXPECT_EQ(pv->numNullsAt(p), numRowsInPartition) << "partition " << p;
+  }
+}
+
+// A row vector with no row-level nulls must report zero per-partition nulls at
+// the row level, even when child columns have nulls.
+TEST_P(PartitioningVectorTest, numNullsAtRowNoRowLevelNulls) {
+  const int numValues = GetParam();
+  auto row = makeRowVector({
+      makeFlatVector<int32_t>(
+          numValues, [](auto row) { return row; }, nullEvery(2)),
+  });
+  ASSERT_FALSE(row->mayHaveNulls());
+  // Assign rows to three partitions round-robin (row i -> partition i % 3).
+  std::vector<uint32_t> partitions(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    partitions[i] = i % 3;
+  }
+  auto pv = PartitionedVector::create(row, partitions, 3, ctx_, pool_.get());
+  for (uint32_t p = 0; p < 3; ++p) {
+    EXPECT_EQ(pv->numNullsAt(p), 0)
+        << "Row-level numNullsAt() must not count child nulls, partition " << p;
+  }
+}
+
+// A row vector with row-level nulls must report per-partition counts that match
+// a manual bit-scan. Child null counts must be counted independently.
+TEST_P(PartitioningVectorTest, numNullsAtRowRowLevelNulls) {
+  const int numValues = GetParam();
+  auto row = makeRowVector(
+      {makeFlatVector<int32_t>(
+          numValues, [](auto row) { return row; }, nullEvery(3))},
+      nullEvery(2));
+  // Assign rows to three partitions round-robin (row i -> partition i % 3).
+  std::vector<uint32_t> partitions(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    partitions[i] = i % 3;
+  }
+  auto pv = PartitionedVector::create(row, partitions, 3, ctx_, pool_.get());
+
+  const auto* rawOffsets = pv->rawPartitionOffsets();
+  for (uint32_t p = 0; p < 3; ++p) {
+    const vector_size_t begin = p == 0 ? 0 : rawOffsets[p - 1];
+    const vector_size_t end = rawOffsets[p];
+    const vector_size_t expected =
+        BaseVector::countNulls(pv->baseVector()->nulls(), begin, end);
+    EXPECT_EQ(pv->numNullsAt(p), expected)
+        << "Row-level null count mismatch, partition " << p;
+  }
+
+  // Child null counts must be tracked independently of row-level nulls.
+  auto* prv = dynamic_cast<PartitionedRowVector*>(pv.get());
+  ASSERT_NE(prv, nullptr);
+  auto child = prv->childAt(0);
+  const auto* childOffsets = child->rawPartitionOffsets();
+  for (uint32_t p = 0; p < 3; ++p) {
+    const vector_size_t begin = p == 0 ? 0 : childOffsets[p - 1];
+    const vector_size_t end = childOffsets[p];
+    const vector_size_t expected =
+        BaseVector::countNulls(child->baseVector()->nulls(), begin, end);
+    EXPECT_EQ(child->numNullsAt(p), expected)
+        << "Child null count mismatch, partition " << p;
+  }
+}
+
 // Test with different vector sizes, including edge cases like 0 and 1.
 INSTANTIATE_TEST_SUITE_P(
     FlatVectorSizes,

From 6f09ea9e45dc7095a0fa4dd247ea83bddc16fcaf Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Mon, 23 Mar 2026 02:32:26 -0700
Subject: [PATCH 09/24] feat(PartitionedOutput): Add
 PrestoIterativePartitioningSerializer

This commit introduces PrestoIterativePartitioningSerializer, which
buffers RowVectors across multiple append() calls, partitions rows
in-place using PartitionedVector, and on flush() serializes each
non-empty partition into a Presto wire-format IOBuf. The serializer has
no dependency on velox_exec: it returns raw folly::IOBuf objects,
leaving SerializedPage creation to the caller.
---
 velox/serializers/CMakeLists.txt              |   1 +
 .../PrestoIterativePartitioningSerializer.cpp | 732 ++++++++++++++++++
 .../PrestoIterativePartitioningSerializer.h   | 164 ++++
 velox/serializers/benchmarks/CMakeLists.txt   |  14 +
 ...erativePartitioningSerializerBenchmark.cpp | 177 +++++
 velox/serializers/tests/CMakeLists.txt        |   2 +
 ...stoIterativePartitioningSerializerTest.cpp | 661 ++++++++++++++++
 7 files changed, 1751 insertions(+)
 create mode 100644 velox/serializers/PrestoIterativePartitioningSerializer.cpp
 create mode 100644 velox/serializers/PrestoIterativePartitioningSerializer.h
 create mode 100644 velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
 create mode 100644 velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp

diff --git a/velox/serializers/CMakeLists.txt b/velox/serializers/CMakeLists.txt
index c5227f763ff..366b043aeb1 100644
--- a/velox/serializers/CMakeLists.txt
+++ b/velox/serializers/CMakeLists.txt
@@ -29,6 +29,7 @@ velox_add_library(
   UnsafeRowSerializer.cpp
   PrestoBatchVectorSerializer.cpp
   PrestoHeader.cpp
+  PrestoIterativePartitioningSerializer.cpp
   PrestoIterativeVectorSerializer.cpp
   PrestoSerializerDeserializationUtils.cpp
   PrestoSerializerEstimationUtils.cpp
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.cpp b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
new file mode 100644
index 00000000000..88e7e7f9a5d
--- /dev/null
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+
+#include "velox/common/base/BitUtil.h"
+#include "velox/type/Type.h"
+#include "velox/vector/ComplexVector.h"
+#include "velox/vector/FlatVector.h"
+
+namespace facebook::velox::serializer::presto {
+
+namespace {
+
+constexpr int8_t kCheckSumBitMask{4};
+constexpr int64_t kVectorSizeTypeSize = sizeof(vector_size_t);
+// [numRows:4][codec:1]
+constexpr int64_t kUncompressedSizeOffset = kVectorSizeTypeSize + 1;
+// [numRows:4][codec:1][uncompressedSize:4][compressedSize:4][checksum:8]
+constexpr int64_t kHeaderSize = kUncompressedSizeOffset + 4 + 4 + 8;
+// Encoding names handed out by typeToEncodingName().
+constexpr std::string_view kByteArray{"BYTE_ARRAY"};
+constexpr std::string_view kShortArray{"SHORT_ARRAY"};
+constexpr std::string_view kIntArray{"INT_ARRAY"};
+constexpr std::string_view kLongArray{"LONG_ARRAY"};
+constexpr std::string_view kInt128Array{"INT128_ARRAY"};
+constexpr std::string_view kVariableWidth{"VARIABLE_WIDTH"};
+constexpr std::string_view kRow{"ROW"};
+
+inline void writeInt32(OutputStream* out, int32_t value) { // Raw host bytes.
+  out->write(reinterpret_cast<const char*>(&value), sizeof(value));
+}
+
+inline void writeInt64(OutputStream* out, int64_t value) { // Raw host bytes.
+  out->write(reinterpret_cast<const char*>(&value), sizeof(value));
+}
+
+// Returns the codec marker byte written into each page header; only the
+// checksum bit (kCheckSumBitMask) is set.
+char getCodecMarker() {
+  return static_cast<char>(0 | kCheckSumBitMask);
+}
+
+std::string_view typeToEncodingName(const TypePtr& type) {
+  switch (type->kind()) { // Dispatches on the type's kind only.
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+      return kByteArray; // "BYTE_ARRAY"
+    case TypeKind::SMALLINT:
+      return kShortArray; // "SHORT_ARRAY"
+    case TypeKind::INTEGER:
+    case TypeKind::REAL:
+      return kIntArray; // "INT_ARRAY"
+    case TypeKind::BIGINT:
+    case TypeKind::DOUBLE:
+    case TypeKind::TIMESTAMP:
+      return kLongArray; // "LONG_ARRAY"
+    case TypeKind::HUGEINT:
+      return kInt128Array; // "INT128_ARRAY"
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+      return kVariableWidth; // "VARIABLE_WIDTH"
+    case TypeKind::ROW:
+      return kRow; // "ROW"
+    default: // Every other kind is rejected with an error.
+      VELOX_FAIL("Unsupported type kind: {}", static_cast<int>(type->kind()));
+  }
+}
+
+/// Completes the page checksum: takes the CRC accumulated by the listener and
+/// feeds it the codec marker, the row count and the uncompressed size.
+int64_t computeChecksum(
+    PrestoOutputStreamListener& listener,
+    int8_t codecMarker,
+    int32_t numRows,
+    int32_t uncompressedSize) {
+  auto pageCrc = listener.crc();
+  pageCrc.process_bytes(&codecMarker, sizeof(codecMarker));
+  pageCrc.process_bytes(&numRows, sizeof(numRows));
+  pageCrc.process_bytes(&uncompressedSize, sizeof(uncompressedSize));
+  return static_cast<int64_t>(pageCrc.checksum());
+}
+
+/// Serialized byte width of a fixed-width kind (sizeof(T) in flushFlatValues),
+/// 0 otherwise. NOTE(review): TIMESTAMP=16 yet its encoding is LONG_ARRAY.
+int32_t fixedTypeWidth(TypeKind kind) {
+  switch (kind) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+      return 1;
+    case TypeKind::SMALLINT:
+      return 2;
+    case TypeKind::INTEGER:
+    case TypeKind::REAL:
+      return 4;
+    case TypeKind::BIGINT:
+    case TypeKind::DOUBLE:
+      return 8;
+    case TypeKind::TIMESTAMP: // TODO confirm against the flush path.
+    case TypeKind::HUGEINT:
+      return 16;
+    default: // Kinds not listed above report a width of 0.
+      return 0;
+  }
+}
+
+/// Exact serialized byte count of one fixed-width column in one partition.
+int64_t
+simpleColumnBytes(const TypePtr& colType, int64_t numRows, int64_t numNulls) {
+  const auto encodingName = typeToEncodingName(colType);
+  return 4 + static_cast<int64_t>(encodingName.size()) + // name length + name
+      4 + // rowCount
+      1 + // nullFlag
+      (numNulls > 0 ? bits::nbytes(numRows) : 0) + // bitmap only with nulls
+      (numRows - numNulls) * fixedTypeWidth(colType->kind()); // values
+}
+
+/// Returns per-partition exact byte counts for one column (all partitions),
+/// recursing into nested ROW columns. Fails a VELOX_CHECK if a batch of a ROW
+/// column is not a PartitionedRowVector; throws for unsupported kinds.
+/// Byte layout per column type:
+///   Fixed-width: simpleColumnBytes(colType, numRows, numNulls)
+///   ROW:         7 (header) + 4 (numFields)
+///                + sum(child sizes)
+///                + 4 (numRows) + 4*(numRows+1) (offsets) + 1 (hasNulls)
+///                + (rowNulls>0 ? bits::nbytes(numRows) : 0)
+std::vector<int64_t> computeColumnFlushSizes(
+    const std::vector<PartitionedVectorPtr>& columnVectors,
+    const TypePtr& colType,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<vector_size_t>& rowsPerPartition,
+    uint32_t numPartitions) {
+  std::vector<int64_t> sizes(numPartitions, 0);
+
+  // Compute per-partition null counts by summing across batches.
+  std::vector<int64_t> nullCounts(numPartitions, 0);
+  for (uint32_t p : nonEmptyPartitions) {
+    for (const auto& pv : columnVectors) {
+      nullCounts[p] += pv->numNullsAt(p);
+    }
+  }
+
+  switch (colType->kind()) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+    case TypeKind::SMALLINT:
+    case TypeKind::INTEGER:
+    case TypeKind::BIGINT:
+    case TypeKind::REAL:
+    case TypeKind::DOUBLE:
+    case TypeKind::HUGEINT:
+      for (uint32_t p : nonEmptyPartitions) {
+        sizes[p] =
+            simpleColumnBytes(colType, rowsPerPartition[p], nullCounts[p]);
+      }
+      break;
+
+    case TypeKind::TIMESTAMP:
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+    case TypeKind::ARRAY:
+    case TypeKind::MAP:
+      VELOX_NYI(
+          "computeColumnFlushSizes: unsupported type kind {}",
+          TypeKindName::toName(colType->kind()));
+
+    case TypeKind::ROW: {
+      const auto& rowSchema = colType->asRow();
+      const int32_t numFields = static_cast<int32_t>(rowSchema.size());
+
+      // Fixed per-partition overhead: header(7) + numFields(4), then the
+      // footer: numRows(4) + sequential offsets 4*(numRows+1) + hasNulls(1),
+      // plus the ROW vector's own null bitmap when this partition has null
+      // rows. Child batches are downcast with a check further down so that a
+      // batch of another kind raises an error instead of dereferencing null.
+      for (uint32_t p : nonEmptyPartitions) {
+        const int64_t numRows = rowsPerPartition[p];
+        const int64_t rowNullBitmapBytes =
+            nullCounts[p] > 0 ? bits::nbytes(numRows) : 0;
+        sizes[p] = 7 + 4 + // "ROW" header + numFields
+            4 + 4 * (numRows + 1) + 1 + // footer: numRows + offsets + hasNulls
+            rowNullBitmapBytes;
+      }
+      // Add child column sizes recursively.
+      for (uint32_t col = 0; col < static_cast<uint32_t>(numFields); ++col) {
+        std::vector<PartitionedVectorPtr> childVectors;
+        childVectors.reserve(columnVectors.size());
+        for (const auto& pv : columnVectors) {
+          const auto* rowVector = dynamic_cast<PartitionedRowVector*>(pv.get());
+          VELOX_CHECK_NOT_NULL(rowVector);
+          childVectors.push_back(rowVector->childAt(col));
+        }
+        const auto childSizes = computeColumnFlushSizes(
+            childVectors,
+            rowSchema.childAt(col),
+            nonEmptyPartitions,
+            rowsPerPartition,
+            numPartitions);
+        for (uint32_t p : nonEmptyPartitions) {
+          sizes[p] += childSizes[p];
+        }
+      }
+      break;
+    }
+
+    default:
+      VELOX_UNSUPPORTED(
+          "computeColumnFlushSizes: unsupported type kind {}",
+          TypeKindName::toName(colType->kind()));
+  }
+  return sizes;
+}
+
+} // namespace
+
+PrestoIterativePartitioningSerializer::PrestoIterativePartitioningSerializer(
+    RowTypePtr inputType,
+    uint32_t numPartitions,
+    const SerdeOpts& opts,
+    memory::MemoryPool* pool)
+    : type_(std::move(inputType)),
+      numPartitions_(numPartitions),
+      opts_(opts),
+      pool_(pool),
+      rowsPerPartition_(numPartitions, 0) { // No rows buffered yet.
+  VELOX_CHECK_GT(numPartitions_, 0);
+  VELOX_CHECK_NOT_NULL(pool_);
+  VELOX_CHECK_NOT_NULL(type_); // 'type_' is dereferenced just below.
+  numColumns_ = type_->size();
+}
+
+void PrestoIterativePartitioningSerializer::append(
+    const RowVectorPtr& input,
+    const std::vector<uint32_t>& partitions) {
+  VELOX_CHECK_NOT_NULL(input);
+  VELOX_CHECK_EQ(
+      input->size(),
+      partitions.size(),
+      "partitions.size() must equal input->size()");
+
+  if (input->size() == 0) {
+    // An empty batch contributes nothing to buffer.
+    return;
+  }
+
+  // Partition this batch and keep the result until the next flush().
+  PartitionBuildContext buildContext;
+  auto partitioned = PartitionedVector::create(
+      std::static_pointer_cast<BaseVector>(input),
+      partitions,
+      numPartitions_,
+      buildContext,
+      pool_);
+
+  // The raw offsets are per-partition end positions, so the difference of
+  // neighbouring entries is the number of rows this batch adds to 'p'.
+  const vector_size_t* ends = partitioned->rawPartitionOffsets();
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    rowsPerPartition_[p] += ends[p] - (p == 0 ? 0 : ends[p - 1]);
+  }
+  partitionedRowVectors_.push_back(std::move(partitioned));
+
+  bytesBuffered_ += input->retainedSize();
+  rowsBuffered_ += static_cast<int64_t>(input->size());
+}
+
+// ---------------------------------------------------------------------------
+// Top-level flush
+// ---------------------------------------------------------------------------
+
+std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+PrestoIterativePartitioningSerializer::flush() {
+  auto pages =
+      (opts_.compressionKind == common::CompressionKind::CompressionKind_NONE)
+      ? flushUncompressed()
+      : flushCompressed();
+  // Drop all buffered batches and counters now that the pages are built.
+  partitionedRowVectors_.clear();
+  flushSizes_.clear();
+  std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0);
+  bytesBuffered_ = 0;
+  rowsBuffered_ = 0;
+  // Keyed by partition id; each value pairs an IOBuf with a row count.
+  return pages;
+}
+
+// Builds one uncompressed page per non-empty partition: id -> (page, rows).
+std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+PrestoIterativePartitioningSerializer::flushUncompressed() {
+  if (partitionedRowVectors_.empty()) {
+    return {};
+  }
+  const char codecMask = getCodecMarker();
+
+  // 1. Determine non-empty partitions.
+  std::vector<uint32_t> nonEmptyPartitions;
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    if (rowsPerPartition_[p] > 0) {
+      nonEmptyPartitions.push_back(p);
+    }
+  }
+
+  // 2. Pre-compute exact byte sizes per top-level column and partition.
+  const auto& rowSchema = type_->asRow();
+  flushSizes_.assign(rowSchema.size(), std::vector<int64_t>(numPartitions_, 0));
+  for (uint32_t col = 0; col < rowSchema.size(); ++col) {
+    std::vector<PartitionedVectorPtr> columnVectors;
+    columnVectors.reserve(partitionedRowVectors_.size());
+    for (const auto& pRowVector : partitionedRowVectors_) {
+      // Checked downcast: fail loudly rather than dereference a null result.
+      auto* rowVector = dynamic_cast<PartitionedRowVector*>(pRowVector.get());
+      VELOX_CHECK_NOT_NULL(rowVector);
+      columnVectors.push_back(rowVector->childAt(col));
+    }
+    flushSizes_[col] = computeColumnFlushSizes(
+        columnVectors,
+        rowSchema.childAt(col),
+        nonEmptyPartitions,
+        rowsPerPartition_,
+        numPartitions_);
+  }
+
+  // 3. Create output streams sized to the exact bytes each partition will need,
+  // so that the entire payload fits. This avoids multiple resizing and copying.
+  std::vector<std::unique_ptr<PrestoOutputStreamListener>> listeners(
+      numPartitions_);
+  std::vector<std::unique_ptr<IOBufOutputStream>> outputStreams(numPartitions_);
+  std::vector<IOBufOutputStream*> rawOutputStreams(numPartitions_);
+  std::vector<std::streampos> beginStreamPositions(numPartitions_);
+
+  for (uint32_t p : nonEmptyPartitions) {
+    int64_t initialSize = kHeaderSize + 4; // page header + numCols
+    for (uint32_t col = 0; col < rowSchema.size(); ++col) {
+      initialSize += flushSizes_[col][p];
+    }
+    listeners[p] = std::make_unique<PrestoOutputStreamListener>();
+    outputStreams[p] = std::make_unique<IOBufOutputStream>(
+        *pool_, listeners[p].get(), initialSize);
+    rawOutputStreams[p] = outputStreams[p].get();
+    beginStreamPositions[p] = outputStreams[p]->tellp();
+
+    flushStart(*outputStreams[p], p, codecMask);
+  }
+
+  // 4. Flush column data.
+  flushRowChildren(
+      partitionedRowVectors_, rowSchema, nonEmptyPartitions, rawOutputStreams);
+
+  // 5. Finalize the page by seeking back to fill in sizes and CRC, and get the
+  // IOBuf and numOfRows from each stream.
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+      result;
+  for (uint32_t p : nonEmptyPartitions) {
+    flushFinish(
+        *outputStreams[p],
+        p,
+        beginStreamPositions[p],
+        codecMask,
+        *listeners[p]);
+    result[p] =
+        std::make_pair(outputStreams[p]->getIOBuf(), rowsPerPartition_[p]);
+  }
+
+  return result;
+}
+
+std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+PrestoIterativePartitioningSerializer::flushCompressed() {
+  VELOX_NYI(); // Compressed output is not implemented yet.
+}
+
+// ---------------------------------------------------------------------------
+// Second level functions: start, columns and finish
+// ---------------------------------------------------------------------------
+
+/// Begins the page for `partition` on `out`: emits a zero-filled header of
+/// kHeaderSize bytes that carries only the row count and the codec mask, then
+/// the column count. The size and CRC fields of the header are filled in later
+/// by flushFinish().
+///
+/// @param out Stream receiving the page.
+/// @param partition Partition whose cumulative row count goes in the header.
+/// @param codecMask Codec marker byte stored right after the row count.
+void PrestoIterativePartitioningSerializer::flushStart(
+    IOBufOutputStream& out,
+    uint32_t partition,
+    char codecMask) const {
+  auto* const checksumListener =
+      dynamic_cast<PrestoOutputStreamListener*>(out.listener());
+
+  // Header layout: row count in bytes [0, 4), codec mask in byte 4. All other
+  // bytes stay zero until the page is finalized.
+  char pageHeader[kHeaderSize] = {};
+  const auto rowCount = static_cast<int32_t>(rowsPerPartition_[partition]);
+  std::memcpy(pageHeader, &rowCount, sizeof(rowCount));
+  pageHeader[sizeof(rowCount)] = codecMask;
+
+  // The header is written while the listener is paused.
+  // NOTE(review): presumably so that the header bytes stay out of the CRC --
+  // confirm against PrestoOutputStreamListener.
+  if (checksumListener != nullptr) {
+    checksumListener->pause();
+  }
+  out.write(pageHeader, kHeaderSize);
+  if (checksumListener != nullptr) {
+    checksumListener->resume();
+  }
+
+  // Number of columns is included in the CRC.
+  const auto columnCount = static_cast<int32_t>(numColumns_);
+  out.write(reinterpret_cast<const char*>(&columnCount), sizeof(columnCount));
+}
+
+/// Serializes the children of the buffered row vectors column by column. For
+/// each column index, the matching child of every entry in
+/// `partitionedVectors` is collected and the resulting per-batch list is
+/// handed to flushColumn() together with the column's type.
+void PrestoIterativePartitioningSerializer::flushRowChildren(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const RowType& rowSchema,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  for (uint32_t childIdx = 0; childIdx < rowSchema.size(); ++childIdx) {
+    std::vector<PartitionedVectorPtr> childBatches;
+    childBatches.reserve(partitionedVectors.size());
+
+    for (const auto& batch : partitionedVectors) {
+      const auto rowBatch =
+          std::dynamic_pointer_cast<PartitionedRowVector>(batch);
+      // Every buffered batch is expected to be a PartitionedRowVector; this
+      // is only verified in debug builds.
+      VELOX_DCHECK_NOT_NULL(rowBatch.get());
+      childBatches.push_back(rowBatch->childAt(childIdx));
+    }
+
+    flushColumn(
+        childBatches,
+        rowSchema.childAt(childIdx),
+        nonEmptyPartitions,
+        outputStreams);
+  }
+}
+
+/// Finalizes the page written to `out` for `partition`.
+///
+/// With the listener paused, computes the page size relative to
+/// `beginOffset`, obtains the checksum via computeChecksum(), seeks back to
+/// `beginOffset + kUncompressedSizeOffset` to fill in the two size fields and
+/// the CRC, and finally restores the write position to the end of the page.
+///
+/// @param out Stream holding the page started by flushStart().
+/// @param partition Partition whose row count is fed into the checksum.
+/// @param beginOffset Stream position at which the page header starts.
+/// @param codecMask Codec marker byte, also fed into the checksum.
+/// @param listener Listener passed to computeChecksum(); it is paused here
+///        and not resumed by this function.
+/// @throws VeloxRuntimeError if the page size does not fit in an int32.
+void PrestoIterativePartitioningSerializer::flushFinish(
+    IOBufOutputStream& out,
+    uint32_t partition,
+    std::streampos beginOffset,
+    char codecMask,
+    PrestoOutputStreamListener& listener) const {
+  listener.pause();
+
+  // Sizes are byte counts, not stream positions, so keep them in integer
+  // types. The header stores them as int32: refuse pages that do not fit
+  // instead of silently truncating the value.
+  const int64_t totalSize = out.tellp() - beginOffset;
+  VELOX_CHECK_EQ(
+      totalSize,
+      static_cast<int32_t>(totalSize),
+      "Serialized page size does not fit in int32");
+  const auto uncompressedSize = static_cast<int32_t>(totalSize - kHeaderSize);
+
+  const int64_t crc = computeChecksum(
+      listener,
+      static_cast<int8_t>(codecMask),
+      static_cast<int32_t>(rowsPerPartition_[partition]),
+      uncompressedSize);
+
+  // Patch the header fields that flushStart() left zeroed.
+  out.seekp(beginOffset + kUncompressedSizeOffset);
+  writeInt32(&out, uncompressedSize);
+  writeInt32(&out, uncompressedSize); // TODO: compressedSize
+  writeInt64(&out, crc);
+
+  // Return to the end of the page so later writes append after it.
+  out.seekp(beginOffset + totalSize);
+}
+
+// ---------------------------------------------------------------------------
+// Column-level dispatch
+// ---------------------------------------------------------------------------
+
+/// Routes one column to the flush routine that matches its type kind.
+///
+/// The kind is read from the base vector of the first batch. BOOLEAN, the
+/// integer kinds, REAL, DOUBLE and HUGEINT go to flushSimpleColumn().
+/// TIMESTAMP, VARCHAR, VARBINARY, ROW, ARRAY and MAP are not implemented yet.
+/// Any other kind is rejected as unsupported.
+///
+/// @param partitionedVectors One entry per buffered batch; must be non-empty.
+/// @param colType Type of the column, forwarded to flushSimpleColumn().
+/// @param nonEmptyPartitions Partitions that receive output.
+/// @param outputStreams Output streams indexed by partition.
+void PrestoIterativePartitioningSerializer::flushColumn(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const TypePtr& colType,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  VELOX_CHECK_GT(partitionedVectors.size(), 0);
+
+  const auto typeKind = partitionedVectors[0]->baseVector()->typeKind();
+  switch (typeKind) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+    case TypeKind::SMALLINT:
+    case TypeKind::INTEGER:
+    case TypeKind::BIGINT:
+    case TypeKind::REAL:
+    case TypeKind::DOUBLE:
+    case TypeKind::HUGEINT:
+      flushSimpleColumn(
+          partitionedVectors, colType, nonEmptyPartitions, outputStreams);
+      break;
+
+    case TypeKind::TIMESTAMP:
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+    case TypeKind::ROW:
+    case TypeKind::ARRAY:
+    case TypeKind::MAP:
+      // VELOX_NYI() does not return, so there is no fall-through into
+      // `default`.
+      VELOX_NYI();
+
+    default:
+      // The "{}" placeholder is required for `typeKind` to appear in the
+      // message; an argument without a placeholder is not rendered.
+      VELOX_UNSUPPORTED(
+          "Invalid vector encoding for PrestoIterativePartitioningSerializer: {}",
+          typeKind);
+  }
+}
+
+/// Writes one fixed-width column for the non-empty partitions, in this order:
+/// the encoding name derived from `colType`, the per-partition row counts,
+/// the null section, and finally the values of each buffered batch.
+void PrestoIterativePartitioningSerializer::flushSimpleColumn(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const TypePtr& colType,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  // Column preamble, written once per partition.
+  flushHeader(typeToEncodingName(colType), nonEmptyPartitions, outputStreams);
+  flushRowCounts(nonEmptyPartitions, outputStreams);
+  flushNulls(partitionedVectors, nonEmptyPartitions, outputStreams);
+
+  // Values follow batch by batch, in the order the batches were buffered.
+  for (const auto& batch : partitionedVectors) {
+    flushSingleSimpleVector(batch, outputStreams);
+  }
+}
+
+/// Writes the values of one flat batch whose native type is
+/// TypeTraits<kind>::NativeType. The batch's raw value buffer, raw null
+/// buffer and partition offsets are forwarded to flushFlatValues().
+template <TypeKind kind>
+void PrestoIterativePartitioningSerializer::flushSingleFlatVector(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  using ValueType = typename TypeTraits<kind>::NativeType;
+
+  auto* partitionedFlat =
+      partitionedVector->as<PartitionedFlatVector<ValueType>>();
+  VELOX_DCHECK_NOT_NULL(partitionedFlat);
+
+  const auto* values = partitionedFlat->baseVector()
+                           ->template as<FlatVector<ValueType>>()
+                           ->rawValues();
+  const auto* nulls = partitionedFlat->baseVector()->rawNulls();
+  const auto* offsets = partitionedFlat->rawPartitionOffsets();
+
+  flushFlatValues<ValueType>(values, nulls, offsets, outputStreams);
+}
+
+// BOOLEAN specialization. BOOLEAN columns use kByteArray encoding, but
+// FlatVector<bool> keeps its values bit-packed, so the generic path that
+// copies a typed value buffer does not apply. The value bits are read one at
+// a time instead, and each non-null value is written as one byte (0x00 or
+// 0x01).
+template <>
+void PrestoIterativePartitioningSerializer::flushSingleFlatVector<
+    TypeKind::BOOLEAN>(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  auto* partitionedFlat = partitionedVector->as<PartitionedFlatVector<bool>>();
+  VELOX_DCHECK_NOT_NULL(partitionedFlat);
+
+  const auto* valueBits = partitionedFlat->baseVector()
+                              ->as<FlatVector<bool>>()
+                              ->rawValues<uint64_t>();
+  const auto* nullBits = partitionedFlat->baseVector()->rawNulls();
+  const auto* offsets = partitionedFlat->rawPartitionOffsets();
+
+  // TODO: Improve performance
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    // offsets[p] is the end row of partition p within this batch; it starts
+    // where the previous partition ended (row 0 for the first partition).
+    const vector_size_t begin = (p == 0) ? 0 : offsets[p - 1];
+    const vector_size_t end = offsets[p];
+    const auto numNulls = partitionedVector->numNullsAt(p);
+    auto* stream = outputStreams[p];
+    if (stream == nullptr || end <= begin) {
+      continue;
+    }
+
+    // Only consult the null buffer when the partition reports nulls.
+    const bool skipNulls = !(numNulls == 0);
+    if (skipNulls) {
+      VELOX_DCHECK_NOT_NULL(nullBits);
+    }
+
+    for (auto row = begin; row < end; ++row) {
+      if (skipNulls && bits::isBitNull(nullBits, row)) {
+        continue;
+      }
+      const int8_t byteValue = bits::isBitSet(valueBits, row) ? 1 : 0;
+      stream->write(reinterpret_cast<const char*>(&byteValue), 1);
+    }
+  }
+}
+
+/// Writes the values of one batch according to the encoding of its base
+/// vector.
+///
+/// Only FLAT is implemented; it dispatches on the base vector's type kind to
+/// flushSingleFlatVector(). BIASED, CONSTANT, DICTIONARY and SEQUENCE raise
+/// not-yet-implemented. Any other encoding is rejected as unsupported.
+///
+/// @param partitionedVector Batch whose values are written.
+/// @param outputStreams Output streams indexed by partition.
+void PrestoIterativePartitioningSerializer::flushSingleSimpleVector(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  const auto encoding = partitionedVector->baseVector()->encoding();
+  const auto typeKind = partitionedVector->baseVector()->typeKind();
+
+  // Both error messages below carry a "{}" placeholder: without it the
+  // `encoding` argument is passed to the macro but never rendered.
+  switch (encoding) {
+    case VectorEncoding::Simple::FLAT:
+      VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+          flushSingleFlatVector, typeKind, partitionedVector, outputStreams);
+      break;
+    case VectorEncoding::Simple::BIASED:
+    case VectorEncoding::Simple::CONSTANT:
+    case VectorEncoding::Simple::DICTIONARY:
+    case VectorEncoding::Simple::SEQUENCE:
+      VELOX_NYI(
+          "Unsupported vector encoding for PrestoIterativePartitioningSerializer: {}",
+          encoding);
+    default:
+      VELOX_UNSUPPORTED(
+          "Invalid vector encoding for PrestoIterativePartitioningSerializer:flushSingleSimpleVector {}",
+          encoding);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Column building blocks
+// ---------------------------------------------------------------------------
+
+/// Writes the column header to the stream of every non-empty partition: the
+/// length of `name` as an int32, followed by the bytes of `name`.
+void PrestoIterativePartitioningSerializer::flushHeader(
+    std::string_view name,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  const auto length = static_cast<int32_t>(name.size());
+  for (const auto partition : nonEmptyPartitions) {
+    auto* stream = outputStreams[partition];
+    writeInt32(stream, length);
+    stream->write(name.data(), length);
+  }
+}
+
+/// Writes, for every non-empty partition, its cumulative row count
+/// (rowsPerPartition_) as an int32 to that partition's stream.
+void PrestoIterativePartitioningSerializer::flushRowCounts(
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  for (const auto partition : nonEmptyPartitions) {
+    const auto rowCount = static_cast<int32_t>(rowsPerPartition_[partition]);
+    writeInt32(outputStreams[partition], rowCount);
+  }
+}
+
+/// Writes the null section of one column for every non-empty partition.
+///
+/// First a one-byte flag is written per partition: 1 if the batches report
+/// any nulls for that partition, 0 otherwise. If no partition has nulls the
+/// function returns right after the flags. Otherwise a null bitmap of
+/// bits::nbytes(rowsPerPartition_[p]) bytes is written for each partition
+/// whose flag is 1; partitions without nulls get no bitmap.
+///
+/// NOTE(review): the per-batch walk derives each partition's first row from
+/// the end offset of the previously visited partition, so it presumably
+/// relies on `nonEmptyPartitions` being in ascending order and on skipped
+/// partitions holding no rows in any batch -- confirm against the caller.
+///
+/// @param partitionedVectors One entry per buffered batch of this column.
+/// @param nonEmptyPartitions Partitions that receive output.
+/// @param outputStreams Output streams indexed by partition.
+void PrestoIterativePartitioningSerializer::flushNulls(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  // Sum the null counts over all batches and emit the has-nulls flag byte.
+  std::vector<vector_size_t> nullCounts(numPartitions_, 0);
+  for (uint32_t p : nonEmptyPartitions) {
+    for (const auto& pv : partitionedVectors) {
+      nullCounts[p] += pv->numNullsAt(p);
+    }
+    const char flagByte = nullCounts[p] > 0 ? 1 : 0;
+    outputStreams[p]->write(&flagByte, 1);
+  }
+
+  // Nothing more to write when no partition has a null.
+  const bool hasAnyNulls = std::any_of(
+      nonEmptyPartitions.begin(), nonEmptyPartitions.end(), [&](uint32_t p) {
+        return nullCounts[p] > 0;
+      });
+  if (!hasAnyNulls) {
+    return;
+  }
+
+  // Build each partition's null bitmap in a temporary buffer, accumulating
+  // bits across all batches. Writing via write() correctly handles range
+  // boundaries in the output stream without requiring seekp().
+  // TODO: Avoid this extra memory allocation and copy
+  std::vector<std::vector<uint8_t>> bitmaps(numPartitions_);
+  for (uint32_t p : nonEmptyPartitions) {
+    if (nullCounts[p] > 0) {
+      // Pre-fill with "not null" so batches without a null buffer need no
+      // copy. An empty bitmap marks a partition that has no nulls.
+      bitmaps[p].assign(bits::nbytes(rowsPerPartition_[p]), bits::kNotNullByte);
+    }
+  }
+
+  // destBitOffsets[p] counts the bits of partition p's bitmap that earlier
+  // batches have already covered.
+  std::vector<vector_size_t> destBitOffsets(numPartitions_, 0);
+  for (const auto& pv : partitionedVectors) {
+    const uint64_t* rawNulls = pv->baseVector()->rawNulls();
+    const auto* partitionOffsets = pv->rawPartitionOffsets();
+
+    // partitionOffsets[p] is used as the end row of partition p within this
+    // batch; startBit is the end row of the previously visited partition.
+    vector_size_t startBit = 0;
+    for (uint32_t p : nonEmptyPartitions) {
+      const vector_size_t numBits = partitionOffsets[p] - startBit;
+      if (rawNulls && numBits > 0 && !bitmaps[p].empty()) {
+        bits::copyBits(
+            rawNulls,
+            startBit,
+            reinterpret_cast<uint64_t*>(bitmaps[p].data()),
+            destBitOffsets[p],
+            numBits);
+      }
+      // Advance even when nothing was copied, so that the next batch lands
+      // after this batch's rows in the bitmap.
+      if (!bitmaps[p].empty()) {
+        destBitOffsets[p] += numBits;
+      }
+      startBit = partitionOffsets[p];
+    }
+  }
+
+  for (uint32_t p : nonEmptyPartitions) {
+    if (nullCounts[p] == 0) {
+      continue;
+    }
+
+    // Convert Velox format (LSB-first, 1=not-null) to Presto wire format
+    // (MSB-first, 1=null) in-place.
+    const int32_t numBytes = bits::nbytes(rowsPerPartition_[p]);
+    for (int32_t i = 0; i < numBytes; ++i) {
+      bitmaps[p][i] = ~bitmaps[p][i];
+      bits::reverseBits(&bitmaps[p][i], 1);
+    }
+
+    outputStreams[p]->write(
+        reinterpret_cast<const char*>(bitmaps[p].data()), numBytes);
+  }
+}
+
+/// Writes the non-null values of one flat batch to the partition streams.
+///
+/// `partitionOffsets[p]` is read as the end row (exclusive) of partition `p`
+/// within `partitionedValues`; partition `p` starts where partition `p - 1`
+/// ended, and the first partition starts at row 0. Partitions with a null
+/// stream or without rows in this batch are skipped.
+///
+/// @param partitionedValues Values of the batch, grouped by partition.
+/// @param rawNulls Null bits of the batch, or nullptr if it has no null
+///        buffer.
+/// @param partitionOffsets Per-partition end rows; numPartitions_ entries
+///        are read.
+/// @param outputStreams Output streams indexed by partition.
+template <typename T>
+void PrestoIterativePartitioningSerializer::flushFlatValues(
+    const T* partitionedValues,
+    const uint64_t* rawNulls,
+    const vector_size_t* partitionOffsets,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  constexpr auto kValueBytes = sizeof(T);
+
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    const vector_size_t begin = (p == 0) ? 0 : partitionOffsets[p - 1];
+    const vector_size_t end = partitionOffsets[p];
+    auto* stream = outputStreams[p];
+    if (stream == nullptr || end <= begin) {
+      continue;
+    }
+
+    if (rawNulls == nullptr) {
+      // No null buffer: the partition's values are contiguous, so they are
+      // copied with a single write.
+      stream->write(
+          reinterpret_cast<const char*>(partitionedValues + begin),
+          (end - begin) * kValueBytes);
+      continue;
+    }
+
+    // Presto writes only non-null values; null slots are omitted.
+    // TODO: Improve performance
+    for (auto row = begin; row < end; ++row) {
+      if (bits::isBitNull(rawNulls, row)) {
+        continue;
+      }
+      stream->write(
+          reinterpret_cast<const char*>(partitionedValues + row), kValueBytes);
+    }
+  }
+}
+
+/// Writes, for every non-empty partition holding N rows, the int32 sequence
+/// 0, 1, ..., N (N + 1 values) to that partition's stream.
+void PrestoIterativePartitioningSerializer::flushSequentialOffsets(
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  for (const auto partition : nonEmptyPartitions) {
+    auto* stream = outputStreams[partition];
+    const auto lastOffset = static_cast<int32_t>(rowsPerPartition_[partition]);
+    int32_t next = 0;
+    while (next <= lastOffset) {
+      writeInt32(stream, next);
+      ++next;
+    }
+  }
+}
+
+} // namespace facebook::velox::serializer::presto
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.h b/velox/serializers/PrestoIterativePartitioningSerializer.h
new file mode 100644
index 00000000000..b9e41286ea6
--- /dev/null
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <map>
+#include <memory>
+#include <vector>
+
+#include <folly/io/IOBuf.h>
+
+#include "velox/common/memory/ByteStream.h"
+#include "velox/serializers/PrestoSerializer.h"
+#include "velox/type/Type.h"
+#include "velox/vector/PartitionedVector.h"
+
+namespace facebook::velox::serializer::presto {
+
+/// Convenience alias matching PrestoSerializer.cpp convention.
+using SerdeOpts = PrestoVectorSerde::PrestoOptions;
+
+/// Serializes a stream of RowVectors into per-partition Presto pages.
+///
+/// Each call to append() routes rows to their assigned partition. flush()
+/// produces one Presto-format IOBuf per non-empty partition and resets the
+/// internal state so the serializer can be reused for the next cycle.
+class PrestoIterativePartitioningSerializer {
+ public:
+  /// Creates a serializer for rows of `inputType` spread over
+  /// `numPartitions` partitions.
+  /// NOTE(review): `pool` is kept as a raw pointer (see `pool_`), so it
+  /// presumably has to outlive the serializer -- confirm.
+  PrestoIterativePartitioningSerializer(
+      RowTypePtr inputType,
+      uint32_t numPartitions,
+      const SerdeOpts& opts,
+      memory::MemoryPool* pool);
+
+  /// Routes each row in `input` to the partition indicated by `partitions`.
+  /// `partitions.size()` must equal `input->size()`.
+  void append(
+      const RowVectorPtr& input,
+      const std::vector<uint32_t>& partitions);
+
+  /// Serializes all buffered data into one Presto page per non-empty partition
+  /// and resets internal state. Returns an empty map if nothing has been
+  /// appended since the last flush.
+  ///
+  /// The map is keyed by partition; each value pairs the page's IOBuf with
+  /// the number of rows in that partition.
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+  flush();
+
+  /// Returns the total retained bytes of all appended input vectors.
+  int64_t bytesBuffered() const {
+    return bytesBuffered_;
+  }
+
+  /// Returns the total number of rows appended since the last flush.
+  int64_t rowsBuffered() const {
+    return rowsBuffered_;
+  }
+
+  /// Returns the number of rows buffered for the given partition.
+  /// Must be called before flush(), which resets per-partition counts.
+  /// `partition` is range-checked in debug builds only.
+  int64_t rowsPerPartition(uint32_t partition) const {
+    VELOX_DCHECK_LT(partition, numPartitions_);
+    return rowsPerPartition_[partition];
+  }
+
+ private:
+  /// Flush path that produces uncompressed pages, one per non-empty
+  /// partition.
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+  flushUncompressed();
+  /// Flush path for compressed pages. Not implemented yet.
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+  flushCompressed();
+
+  /// Starts the page of `partition` on `out`: writes the page header with the
+  /// row count and `codecMask` filled in, followed by the column count.
+  void flushStart(IOBufOutputStream& out, uint32_t partition, char codecMask)
+      const;
+
+  /// Completes the page that starts at `beginOffset` in `out` by filling in
+  /// the size fields and the checksum of its header.
+  void flushFinish(
+      IOBufOutputStream& out,
+      uint32_t partition,
+      std::streampos beginOffset,
+      char codecMask,
+      PrestoOutputStreamListener& listener) const;
+
+  /// Flushes every column of `rowSchema`: gathers the matching child of each
+  /// entry in `partitionedVectors` and passes the list to flushColumn().
+  void flushRowChildren(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const RowType& rowSchema,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Dispatches one column on its type kind. Only fixed-width scalar kinds
+  /// are currently handled.
+  void flushColumn(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const TypePtr& colType,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes one fixed-width column: encoding name, row counts, nulls, then
+  /// the values of each batch.
+  void flushSimpleColumn(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const TypePtr& colType,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes the values of one batch based on its vector encoding. Only FLAT
+  /// is currently handled.
+  void flushSingleSimpleVector(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes the values of one flat batch of the given type kind. BOOLEAN has
+  /// its own specialization in the .cpp file.
+  template <TypeKind kind>
+  void flushSingleFlatVector(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes `name` prefixed by its int32 length to the stream of each
+  /// non-empty partition.
+  void flushHeader(
+      std::string_view name,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes each non-empty partition's row count as an int32.
+  void flushRowCounts(
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes the has-nulls flag and, for partitions with nulls, the null
+  /// bitmap accumulated over all batches.
+  void flushNulls(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes the non-null values of one flat batch, split by
+  /// `partitionOffsets`, to the partition streams.
+  template <typename T>
+  void flushFlatValues(
+      const T* partitionedValues,
+      const uint64_t* rawNulls,
+      const vector_size_t* partitionOffsets,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  /// Writes the int32 sequence 0..rowCount (inclusive) for each non-empty
+  /// partition.
+  void flushSequentialOffsets(
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  // Construction parameters.
+  // NOTE(review): presumably stored as passed to the constructor; the
+  // constructor body is not part of this header -- confirm.
+  RowTypePtr type_;
+  uint32_t numPartitions_;
+  SerdeOpts opts_;
+  memory::MemoryPool* pool_;
+
+  /// Cumulative row count per partition across all appended batches.
+  std::vector<vector_size_t> rowsPerPartition_;
+
+  /// Number of top-level columns in `type_`.
+  uint32_t numColumns_{0};
+
+  /// Buffered batches in partitioned form; flushed column by column.
+  std::vector<PartitionedVectorPtr> partitionedRowVectors_;
+
+  /// Backing counters for bytesBuffered() and rowsBuffered().
+  int64_t bytesBuffered_{0};
+  int64_t rowsBuffered_{0};
+
+  /// Per-column, per-partition exact byte counts computed during flush.
+  std::vector<std::vector<int64_t>> flushSizes_;
+};
+
+} // namespace facebook::velox::serializer::presto
diff --git a/velox/serializers/benchmarks/CMakeLists.txt b/velox/serializers/benchmarks/CMakeLists.txt
index 7d1044e4367..a81530595e8 100644
--- a/velox/serializers/benchmarks/CMakeLists.txt
+++ b/velox/serializers/benchmarks/CMakeLists.txt
@@ -21,3 +21,17 @@ target_link_libraries(
   Folly::folly
   Folly::follybenchmark
 )
+
+# Benchmark executable built from
+# PrestoIterativePartitioningSerializerBenchmark.cpp.
+add_executable(
+  velox_presto_iterative_partitioning_serializer_benchmark
+  PrestoIterativePartitioningSerializerBenchmark.cpp
+)
+
+target_link_libraries(
+  velox_presto_iterative_partitioning_serializer_benchmark
+  velox_presto_serializer
+  velox_vector_test_lib
+  velox_memory
+  Folly::folly
+  Folly::follybenchmark
+)
diff --git a/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
new file mode 100644
index 00000000000..3244281a5dc
--- /dev/null
+++ b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::serializer::presto;
+
+constexpr int64_t kBufferSize = 2 * 1024 * 1024;
+
+namespace {
+
+/// Helper that builds benchmark inputs, partition assignments and
+/// serializers using the vector factories of VectorTestBase.
+class PrestoIterativePartitioningSerializerBenchmark
+    : public test::VectorTestBase {
+ public:
+  /// Returns a flat vector of `size` rows in which each row holds its own row
+  /// number converted to T. When `nullPct` is non-zero, rows where
+  /// (row % 100) < nullPct are null.
+  template <typename T>
+  VectorPtr makeColumnOfType(vector_size_t size, int32_t nullPct) {
+    const auto valueAt = [](auto row) { return static_cast<T>(row); };
+    if (nullPct != 0) {
+      const auto isNullAt = [nullPct](auto row) {
+        return (row % 100) < nullPct;
+      };
+      return makeFlatVector<T>(size, valueAt, isNullAt);
+    }
+    return makeFlatVector<T>(size, valueAt);
+  }
+
+  /// Returns a flat vector for `colKind` with the given null percentage.
+  /// Supports BOOLEAN, INTEGER, BIGINT and HUGEINT; other kinds are
+  /// rejected.
+  VectorPtr makeColumn(vector_size_t size, TypeKind colKind, int32_t nullPct) {
+    if (colKind == TypeKind::BOOLEAN) {
+      return makeColumnOfType<bool>(size, nullPct);
+    }
+    if (colKind == TypeKind::INTEGER) {
+      return makeColumnOfType<int32_t>(size, nullPct);
+    }
+    if (colKind == TypeKind::BIGINT) {
+      return makeColumnOfType<int64_t>(size, nullPct);
+    }
+    if (colKind == TypeKind::HUGEINT) {
+      return makeColumnOfType<int128_t>(size, nullPct);
+    }
+    VELOX_UNSUPPORTED(
+        "Unsupported TypeKind: {}", TypeKindName::toName(colKind));
+  }
+
+  /// Returns a RowVector with `numCols` columns named c0, c1, ..., all of
+  /// kind `colKind` and built by makeColumn().
+  RowVectorPtr makeInput(
+      vector_size_t size,
+      TypeKind colKind,
+      uint32_t numCols,
+      int32_t nullPct) {
+    std::vector<std::string> columnNames;
+    std::vector<VectorPtr> columns;
+    columnNames.reserve(numCols);
+    columns.reserve(numCols);
+    for (uint32_t col = 0; col < numCols; ++col) {
+      columnNames.emplace_back(fmt::format("c{}", col));
+      columns.emplace_back(makeColumn(size, colKind, nullPct));
+    }
+    return makeRowVector(columnNames, columns);
+  }
+
+  /// Returns `size` partition ids assigned round-robin: row i goes to
+  /// partition i % numPartitions.
+  std::vector<uint32_t> makePartitions(
+      vector_size_t size,
+      uint32_t numPartitions) {
+    std::vector<uint32_t> assignment(size);
+    vector_size_t row = 0;
+    for (auto& partition : assignment) {
+      partition = row % numPartitions;
+      ++row;
+    }
+    return assignment;
+  }
+
+  /// Returns a serializer for `type` with default-constructed options,
+  /// allocating from this fixture's pool.
+  std::unique_ptr<PrestoIterativePartitioningSerializer> makeSerializer(
+      const RowTypePtr& type,
+      uint32_t numPartitions) {
+    SerdeOpts defaultOpts;
+    return std::make_unique<PrestoIterativePartitioningSerializer>(
+        type, numPartitions, defaultOpts, pool_.get());
+  }
+};
+
+} // namespace
+
+/// Single benchmark function parameterized by (colKind, numCols, nullPct,
+/// numPartitions). Registered via BENCHMARK_NAMED_PARAM below.
+///
+/// Every appended batch has 10'000 rows. Each of the `iters` iterations
+/// refills the serializer until at least kBufferSize bytes are buffered and
+/// then times exactly one flush(). Setup (input creation, serializer
+/// construction, append) and the destruction of the flush result are excluded
+/// from the measured time.
+///
+/// @param iters Number of flushes to time. The benchmark framework divides
+///        the measured time by this count, so it must be honored.
+void benchmarkFlush(
+    uint32_t iters,
+    TypeKind colKind,
+    uint32_t numCols,
+    int32_t nullPct,
+    uint32_t numPartitions) {
+  folly::BenchmarkSuspender suspender;
+  PrestoIterativePartitioningSerializerBenchmark benchmark;
+  auto input = benchmark.makeInput(10'000, colKind, numCols, nullPct);
+  auto parts = benchmark.makePartitions(10'000, numPartitions);
+  auto serializer = benchmark.makeSerializer(
+      std::static_pointer_cast<const RowType>(input->type()), numPartitions);
+
+  for (uint32_t iter = 0; iter < iters; ++iter) {
+    // flush() is documented to reset the serializer, so it is refilled
+    // (untimed) before every measured flush.
+    while (serializer->bytesBuffered() < kBufferSize) {
+      serializer->append(input, parts);
+    }
+
+    suspender.dismiss();
+    auto result = serializer->flush();
+    folly::doNotOptimizeAway(result);
+    // Stop the clock before `result` goes out of scope.
+    suspender.rehire();
+  }
+}
+
+// clang-format off
+// Dimensions:
+//   col type:       {bool, int, bigint, ldec (HUGEINT)}
+//   num cols:       {1, 4, 16, 64}
+//   null pct:       {0, 25, 50, 75, 100}
+//   num partitions: {1, 4, 16, 64, 256, 1024}
+//
+// Param name passed to BENCHMARK_NAMED_PARAM for benchmarkFlush:
+//   <type>_<N>cols_<P>pct_<K>parts
+//
+// FLUSH_PARAM registers one benchmarkFlush instance. FLUSH_FOR_PARTS,
+// FLUSH_FOR_NULLS and FLUSH_FOR_COLS expand it over the partition counts,
+// null percentages and column counts listed above.
+
+#define FLUSH_PARAM(type_name, kind, num_cols, null_pct, num_parts) \
+  BENCHMARK_NAMED_PARAM(                                            \
+      benchmarkFlush,                                               \
+      type_name## _## num_cols## cols_## null_pct## pct_## num_parts## parts, \
+      TypeKind::kind, num_cols, null_pct, num_parts)
+
+#define FLUSH_FOR_PARTS(type_name, kind, num_cols, null_pct) \
+  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 1)        \
+  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 4)        \
+  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 16)       \
+  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 64)       \
+  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 256)      \
+  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 1024)
+
+#define FLUSH_FOR_NULLS(type_name, kind, num_cols) \
+  FLUSH_FOR_PARTS(type_name, kind, num_cols, 0)    \
+  FLUSH_FOR_PARTS(type_name, kind, num_cols, 25)   \
+  FLUSH_FOR_PARTS(type_name, kind, num_cols, 50)   \
+  FLUSH_FOR_PARTS(type_name, kind, num_cols, 75)   \
+  FLUSH_FOR_PARTS(type_name, kind, num_cols, 100)
+
+#define FLUSH_FOR_COLS(type_name, kind) \
+  FLUSH_FOR_NULLS(type_name, kind, 1)   \
+  FLUSH_FOR_NULLS(type_name, kind, 4)   \
+  FLUSH_FOR_NULLS(type_name, kind, 16)  \
+  FLUSH_FOR_NULLS(type_name, kind, 64)
+
+FLUSH_FOR_COLS(bool, BOOLEAN)
+FLUSH_FOR_COLS(int, INTEGER)
+FLUSH_FOR_COLS(bigint, BIGINT)
+FLUSH_FOR_COLS(ldec, HUGEINT)
+// clang-format on
+
+/// Benchmark entry point: initializes folly, the Velox memory manager and the
+/// Presto vector serde, in that order, then runs all registered benchmarks.
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv};
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  PrestoVectorSerde::registerVectorSerde();
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/velox/serializers/tests/CMakeLists.txt b/velox/serializers/tests/CMakeLists.txt
index f7f69461ef4..2d1a40275b5 100644
--- a/velox/serializers/tests/CMakeLists.txt
+++ b/velox/serializers/tests/CMakeLists.txt
@@ -36,6 +36,7 @@ target_link_libraries(
 set(
   VELOX_SERIALIZER_TEST_SOURCES
   CompactRowSerializerTest.cpp
+  PrestoIterativePartitioningSerializerTest.cpp
   PrestoOutputStreamListenerTest.cpp
   PrestoSerializerTest.cpp
   SerializedPageFileTest.cpp
@@ -51,6 +52,7 @@ set(
   velox_row_fast
   GTest::gtest
   GTest::gtest_main
+  GTest::gmock
   glog::glog
 )
 
diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
new file mode 100644
index 00000000000..e315684d811
--- /dev/null
+++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
@@ -0,0 +1,661 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <random>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::serializer::presto;
+using namespace facebook::velox::test;
+
+// ---------------------------------------------------------------------------
+// Shared base fixture
+// ---------------------------------------------------------------------------
+
+class PrestoIterativePartitioningSerializerTestBase : public VectorTestBase {
+ protected:
+  static void SetUpTestSuite() {
+    memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{});
+    if (isRegisteredVectorSerde()) {
+      return;
+    }
+    PrestoVectorSerde::registerVectorSerde();
+  }
+
+  /// Decodes one page returned by flush() back into a RowVector of 'type'.
+  RowVectorPtr deserialize(folly::IOBuf& iobuf, const RowTypePtr& type) {
+    BufferInputStream stream(byteRangesFromIOBuf(&iobuf));
+    RowVectorPtr result;
+    serde_.deserialize(&stream, pool_.get(), type, &result, nullptr);
+    return result;
+  }
+
+  /// Copies a flat column's raw values and returns them in ascending order.
+  template <typename T>
+  std::vector<T> sortedValues(const RowVectorPtr& row, int column) {
+    const auto* raw = row->childAt(column)->as<FlatVector<T>>()->rawValues();
+    std::vector<T> sorted(raw, raw + row->size());
+    std::sort(sorted.begin(), sorted.end());
+    return sorted;
+  }
+
+  /// Reads a column in row order; null rows become std::nullopt.
+  template <typename T>
+  std::vector<std::optional<T>> nullableValues(
+      const RowVectorPtr& row,
+      int column) {
+    const auto& child = row->childAt(column);
+    auto* flat = child->as<FlatVector<T>>();
+    const auto numRows = row->size();
+    std::vector<std::optional<T>> values;
+    values.reserve(numRows);
+    for (int i = 0; i < numRows; ++i) {
+      values.push_back(
+          child->isNullAt(i) ? std::nullopt
+                             : std::optional<T>(flat->valueAt(i)));
+    }
+    return values;
+  }
+
+  /// Creates a serializer for 'type' using default-constructed SerdeOpts.
+  std::unique_ptr<PrestoIterativePartitioningSerializer> makeSerializer(
+      const RowTypePtr& type,
+      uint32_t numPartitions) {
+    SerdeOpts defaultOpts;
+    return std::make_unique<PrestoIterativePartitioningSerializer>(
+        type, numPartitions, defaultOpts, pool_.get());
+  }
+
+  PrestoVectorSerde serde_;
+};
+
+// ---------------------------------------------------------------------------
+// Value-parameterized fixture — routing, null-handling over scalar TypePtrs.
+// Uses BaseVector::create() + setNull() so no C++ type dispatch is needed.
+// ---------------------------------------------------------------------------
+
+class PrestoIterativePartitioningSerializerParamTest
+    : public ::testing::TestWithParam<TypePtr>,
+      public PrestoIterativePartitioningSerializerTestBase {
+ public:
+  static void SetUpTestSuite() {
+    PrestoIterativePartitioningSerializerTestBase::SetUpTestSuite();
+  }
+};
+
+// Short lowercase names as in the benchmark, which uses "ldec" for HUGEINT.
+std::string scalarTypeName(const TypePtr& type) {
+  if (type->kind() == TypeKind::BOOLEAN)
+    return "bool";
+  if (type->kind() == TypeKind::INTEGER)
+    return "int";
+  if (type->kind() == TypeKind::BIGINT)
+    return "bigint";
+  if (type->kind() == TypeKind::HUGEINT)
+    return "hugeint";
+  return type->toString();
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ScalarTypes,
+    PrestoIterativePartitioningSerializerParamTest,
+    ::testing::Values(BOOLEAN(), INTEGER(), BIGINT(), HUGEINT()),
+    [](const ::testing::TestParamInfo<TypePtr>& info) {
+      return scalarTypeName(info.param);
+    });
+
+// ── Routing ──────────────────────────────────────────────────────────────────
+
+// Single append, two equal-sized partitions; also verifies rowsBuffered and
+// bytesBuffered lifecycle counters.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, basicTwoPartitions) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto col = BaseVector::create(colType, 6, pool_.get());
+  auto input = makeRowVector({"a"}, {col});
+
+  // Even rows → partition 0, odd rows → partition 1.
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(input, {0, 1, 0, 1, 0, 1});
+
+  EXPECT_EQ(serializer->rowsBuffered(), 6);
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  EXPECT_EQ(serializer->rowsBuffered(), 0);
+  EXPECT_EQ(serializer->bytesBuffered(), 0);
+
+  auto p0 = deserialize(*ioBufs.at(0).first, type);
+  auto p1 = deserialize(*ioBufs.at(1).first, type);
+
+  EXPECT_EQ(p0->size(), 3);
+  EXPECT_EQ(p1->size(), 3);
+}
+
+// All rows routed to one non-zero partition; other partitions are absent.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, allRowsToOnePartition) {
+  auto colType = GetParam();
+  auto type = ROW({"x"}, {colType});
+  auto col = BaseVector::create(colType, 5, pool_.get());
+  auto input = makeRowVector({"x"}, {col});
+
+  auto serializer = makeSerializer(type, 4);
+  serializer->append(input, {2, 2, 2, 2, 2});
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 1);
+  ASSERT_TRUE(ioBufs.count(2));
+  EXPECT_EQ(deserialize(*ioBufs.at(2).first, type)->size(), 5);
+}
+
+// Single partition (numPartitions=1): all rows go to partition 0.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, singlePartition) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto col = BaseVector::create(colType, 5, pool_.get());
+  auto input = makeRowVector({"a"}, {col});
+
+  auto serializer = makeSerializer(type, 1);
+  serializer->append(input, std::vector<uint32_t>(5, 0));
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 1);
+  EXPECT_EQ(deserialize(*ioBufs.at(0).first, type)->size(), 5);
+}
+
+// Multiple columns of the same type: each is serialized independently by
+// flushRowChildren.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, multipleColumns) {
+  auto colType = GetParam();
+  auto type = ROW({"a", "b"}, {colType, colType});
+  auto colA = BaseVector::create(colType, 4, pool_.get());
+  auto colB = BaseVector::create(colType, 4, pool_.get());
+  auto input = makeRowVector({"a", "b"}, {colA, colB});
+
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(input, {0, 0, 1, 1});
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  EXPECT_EQ(r0->size(), 2);
+  EXPECT_EQ(r0->childAt(0)->size(), 2);
+  EXPECT_EQ(r0->childAt(1)->size(), 2);
+
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  EXPECT_EQ(r1->size(), 2);
+  EXPECT_EQ(r1->childAt(0)->size(), 2);
+  EXPECT_EQ(r1->childAt(1)->size(), 2);
+}
+
+// ── Null handling
+// ─────────────────────────────────────────────────────────────
+
+// Nulls appear only in one partition; the other partition is null-free.
+// Rows 0,1,2 → p0; rows 3,4 → p1. Row 1 is null.
+// p0: [not-null, null, not-null]; p1: [not-null, not-null].
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsInOnePartition) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto col = BaseVector::create(colType, 5, pool_.get());
+  col->setNull(1, true);
+  auto input = makeRowVector({"a"}, {col});
+
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(input, {0, 0, 0, 1, 1});
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  ASSERT_EQ(r0->size(), 3);
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(1));
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(2));
+
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  ASSERT_EQ(r1->size(), 2);
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(1));
+}
+
+// Nulls contributed by different appends to the same partition.
+// Append 1: rows 0,1 → p0 (row 1 null); row 2 → p1.
+// Append 2: row 0 → p0 (null); row 1 → p1.
+// p0: [not-null, null, null]; p1: [not-null, not-null].
+TEST_P(
+    PrestoIterativePartitioningSerializerParamTest,
+    nullsAcrossMultipleAppends) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto serializer = makeSerializer(type, 2);
+
+  auto col1 = BaseVector::create(colType, 3, pool_.get());
+  col1->setNull(1, true);
+  serializer->append(makeRowVector({"a"}, {col1}), {0, 0, 1});
+
+  auto col2 = BaseVector::create(colType, 2, pool_.get());
+  col2->setNull(0, true);
+  serializer->append(makeRowVector({"a"}, {col2}), {0, 1});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  ASSERT_EQ(r0->size(), 3);
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(1));
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(2));
+
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  ASSERT_EQ(r1->size(), 2);
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(1));
+}
+
+// Partition boundary falls in the middle of a null-bitmap byte, exercising the
+// bit-extraction carry-over logic. 5 rows → p0, 4 rows → p1. The boundary at
+// bit 5 is inside the first byte of the null bitmap. Rows 1,3,5,7 are null.
+// p0: [not-null, null, not-null, null, not-null].
+// p1: [null, not-null, null, not-null].
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsUnalignedBoundary) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto col = BaseVector::create(colType, 9, pool_.get());
+  col->setNull(1, true);
+  col->setNull(3, true);
+  col->setNull(5, true);
+  col->setNull(7, true);
+  auto input = makeRowVector({"a"}, {col});
+
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(input, {0, 0, 0, 0, 0, 1, 1, 1, 1});
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  ASSERT_EQ(r0->size(), 5);
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(1));
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(2));
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(3));
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(4));
+
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  ASSERT_EQ(r1->size(), 4);
+  EXPECT_TRUE(r1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(1));
+  EXPECT_TRUE(r1->childAt(0)->isNullAt(2));
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(3));
+}
+
+// Both partitions contain nulls.
+// Input: 4 rows, rows 1 and 2 null; rows 0,1 → p0; rows 2,3 → p1.
+// p0: [not-null, null]; p1: [null, not-null].
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsInBothPartitions) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto col = BaseVector::create(colType, 4, pool_.get());
+  col->setNull(1, true);
+  col->setNull(2, true);
+  auto input = makeRowVector({"a"}, {col});
+
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(input, {0, 0, 1, 1});
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  ASSERT_EQ(r0->size(), 2);
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(1));
+
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  ASSERT_EQ(r1->size(), 2);
+  EXPECT_TRUE(r1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(1));
+}
+
+// All rows in one partition are null; the other partition is non-null.
+// Input: 3 rows, rows 0,1 null; rows 0,1 → p0; row 2 → p1.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, allNullsInPartition) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto col = BaseVector::create(colType, 3, pool_.get());
+  col->setNull(0, true);
+  col->setNull(1, true);
+  auto input = makeRowVector({"a"}, {col});
+
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(input, {0, 0, 1});
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  ASSERT_EQ(r0->size(), 2);
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(1));
+
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  ASSERT_EQ(r1->size(), 1);
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(0));
+}
+
+// A null batch followed by a null-free batch for the same partition.
+// Regression: bitmaps must be initialized to all-not-null so that rows from
+// the null-free batch (rawNulls == nullptr) are not decoded as null.
+TEST_P(
+    PrestoIterativePartitioningSerializerParamTest,
+    nullBatchFollowedByNullFreeBatch) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto serializer = makeSerializer(type, 2);
+
+  // Append 1: row 0 → p0 (null); row 1 → p1 (not-null).  rawNulls non-null.
+  auto col1 = BaseVector::create(colType, 2, pool_.get());
+  col1->setNull(0, true);
+  serializer->append(makeRowVector({"a"}, {col1}), {0, 1});
+
+  // Append 2: all not-null (rawNulls == nullptr).  row 0 → p0; row 1 → p1.
+  auto col2 = BaseVector::create(colType, 2, pool_.get());
+  serializer->append(makeRowVector({"a"}, {col2}), {0, 1});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  // p0: [null (append 1), not-null (append 2)]
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  ASSERT_EQ(r0->size(), 2);
+  EXPECT_TRUE(r0->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(r0->childAt(0)->isNullAt(1));
+
+  // p1: [not-null (append 1), not-null (append 2)]
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  ASSERT_EQ(r1->size(), 2);
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(r1->childAt(0)->isNullAt(1));
+}
+
+// ---------------------------------------------------------------------------
+// Non-typed fixture (TEST_F) — lifecycle, structural, regression
+// ---------------------------------------------------------------------------
+
+class PrestoIterativePartitioningSerializerTest
+    : public ::testing::Test,
+      public PrestoIterativePartitioningSerializerTestBase {
+ public:
+  static void SetUpTestSuite() {
+    PrestoIterativePartitioningSerializerTestBase::SetUpTestSuite();
+  }
+};
+
+// Appending an empty RowVector produces no ioBufs on flush.
+TEST_F(PrestoIterativePartitioningSerializerTest, appendEmptyVector) {
+  auto type = ROW({"a"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(makeRowVector({"a"}, {makeFlatVector<int64_t>({})}), {});
+  EXPECT_TRUE(serializer->flush().empty());
+}
+
+// ── Lifecycle
+// ─────────────────────────────────────────────────────────────────
+
+// Multiple append() calls accumulate correctly before flush.
+TEST_F(PrestoIterativePartitioningSerializerTest, multipleAppends) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 3);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({100, 200, 300})}),
+      {0, 1, 2});
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({400, 500, 600})}),
+      {2, 0, 1});
+
+  EXPECT_EQ(serializer->rowsBuffered(), 6);
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 3);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  auto r2 = deserialize(*ioBufs.at(2).first, type);
+
+  ASSERT_EQ(r0->size(), 2);
+  ASSERT_EQ(r1->size(), 2);
+  ASSERT_EQ(r2->size(), 2);
+
+  EXPECT_EQ(sortedValues<int64_t>(r0, 0), (std::vector<int64_t>{100, 500}));
+  EXPECT_EQ(sortedValues<int64_t>(r1, 0), (std::vector<int64_t>{200, 600}));
+  EXPECT_EQ(sortedValues<int64_t>(r2, 0), (std::vector<int64_t>{300, 400}));
+}
+
+// Flush twice: second flush on empty state returns an empty map.
+TEST_F(PrestoIterativePartitioningSerializerTest, flushTwice) {
+  auto type = ROW({"a"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 2);
+  serializer->append(
+      makeRowVector({"a"}, {makeFlatVector<int64_t>({10, 20})}), {0, 1});
+
+  auto ioBufs1 = serializer->flush();
+  ASSERT_EQ(ioBufs1.size(), 2);
+
+  EXPECT_TRUE(serializer->flush().empty());
+}
+
+// Append and flush multiple independent cycles.
+TEST_F(PrestoIterativePartitioningSerializerTest, multipleCycles) {
+  auto type = ROW({"a"}, {INTEGER()});
+  auto serializer = makeSerializer(type, 2);
+
+  for (int cycle = 0; cycle < 3; ++cycle) {
+    serializer->append(
+        makeRowVector(
+            {"a"}, {makeFlatVector<int32_t>({cycle * 2, cycle * 2 + 1})}),
+        {0, 1});
+    auto ioBufs = serializer->flush();
+    ASSERT_EQ(ioBufs.size(), 2) << "cycle " << cycle;
+
+    auto r0 = deserialize(*ioBufs.at(0).first, type);
+    auto r1 = deserialize(*ioBufs.at(1).first, type);
+    ASSERT_EQ(r0->size(), 1) << "cycle " << cycle;
+    ASSERT_EQ(r1->size(), 1) << "cycle " << cycle;
+    EXPECT_EQ(r0->childAt(0)->as<FlatVector<int32_t>>()->valueAt(0), cycle * 2);
+    EXPECT_EQ(
+        r1->childAt(0)->as<FlatVector<int32_t>>()->valueAt(0), cycle * 2 + 1);
+  }
+}
+
+// ── Scale and regression
+// ───────────────────────────────────────────────────────
+
+// 1024 partitions with random int64 values: verify every value reaches
+// exactly the right partition and nothing is lost or duplicated.
+TEST_F(PrestoIterativePartitioningSerializerTest, manyPartitionsRandom) {
+  constexpr uint32_t kNumPartitions = 1024;
+  constexpr int32_t kNumRows = 64'000;
+
+  std::mt19937_64 rng(42);
+  std::uniform_int_distribution<int64_t> valueDist;
+  std::uniform_int_distribution<uint32_t> partDist(0, kNumPartitions - 1);
+
+  std::vector<int64_t> inputValues(kNumRows);
+  std::vector<uint32_t> partitions(kNumRows);
+  // expected[p] holds the sorted values assigned to partition p.
+  std::vector<std::vector<int64_t>> expected(kNumPartitions);
+
+  for (int i = 0; i < kNumRows; ++i) {
+    inputValues[i] = valueDist(rng);
+    partitions[i] = partDist(rng);
+    expected[partitions[i]].push_back(inputValues[i]);
+  }
+  for (auto& v : expected) {
+    std::sort(v.begin(), v.end());
+  }
+
+  auto type = ROW({"v"}, {BIGINT()});
+  auto input = makeRowVector({"v"}, {makeFlatVector<int64_t>(inputValues)});
+
+  auto serializer = makeSerializer(type, kNumPartitions);
+  serializer->append(input, partitions);
+  auto ioBufs = serializer->flush();
+
+  // Every non-empty partition must have a page; empty partitions must not.
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    if (expected[p].empty()) {
+      EXPECT_EQ(ioBufs.count(p), 0) << "partition " << p;
+    } else {
+      ASSERT_EQ(ioBufs.count(p), 1) << "partition " << p;
+      auto result = deserialize(*ioBufs.at(p).first, type);
+      ASSERT_EQ(result->size(), static_cast<int32_t>(expected[p].size()))
+          << "partition " << p;
+      EXPECT_EQ(sortedValues<int64_t>(result, 0), expected[p])
+          << "partition " << p;
+    }
+  }
+}
+
+// 1024 partitions with random int64 values and ~25% nulls: verify every
+// value and null reaches exactly the right partition (compared as sorted
+// multisets, not in input order) and nothing is lost or duplicated.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    manyPartitionsRandomWithNulls) {
+  constexpr uint32_t kNumPartitions = 1024;
+  constexpr int32_t kNumRows = 64'000;
+  constexpr int32_t kNullPct = 25;
+
+  std::mt19937_64 rng(43);
+  std::uniform_int_distribution<int64_t> valueDist;
+  std::uniform_int_distribution<uint32_t> partDist(0, kNumPartitions - 1);
+  std::uniform_int_distribution<int32_t> nullDist(0, 99);
+
+  std::vector<std::optional<int64_t>> inputValues(kNumRows);
+  std::vector<uint32_t> partitions(kNumRows);
+  // expected[p] holds the sequence of (value-or-null) assigned to partition p
+  // in input order.
+  std::vector<std::vector<std::optional<int64_t>>> expected(kNumPartitions);
+
+  for (int i = 0; i < kNumRows; ++i) {
+    partitions[i] = partDist(rng);
+    if (nullDist(rng) < kNullPct) {
+      inputValues[i] = std::nullopt;
+    } else {
+      inputValues[i] = valueDist(rng);
+    }
+    expected[partitions[i]].push_back(inputValues[i]);
+  }
+
+  auto type = ROW({"v"}, {BIGINT()});
+  auto input =
+      makeRowVector({"v"}, {makeNullableFlatVector<int64_t>(inputValues)});
+
+  auto serializer = makeSerializer(type, kNumPartitions);
+  serializer->append(input, partitions);
+  auto ioBufs = serializer->flush();
+
+  // Partition rearranges values within each partition, so compare sorted.
+  // std::optional<T> sorts with nullopt < any value, preserving null count.
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    if (expected[p].empty()) {
+      EXPECT_EQ(ioBufs.count(p), 0) << "partition " << p;
+    } else {
+      ASSERT_EQ(ioBufs.count(p), 1) << "partition " << p;
+      auto result = deserialize(*ioBufs.at(p).first, type);
+      ASSERT_EQ(result->size(), static_cast<int32_t>(expected[p].size()))
+          << "partition " << p;
+
+      auto expectedSorted = expected[p];
+      std::sort(expectedSorted.begin(), expectedSorted.end());
+
+      auto actual = nullableValues<int64_t>(result, 0);
+      std::sort(actual.begin(), actual.end());
+
+      EXPECT_EQ(actual, expectedSorted) << "partition " << p;
+    }
+  }
+}
+
+// Regression: flushNulls previously wrote null bitmaps by obtaining a raw
+// pointer via writePosition() then advancing the stream via seekp(). This
+// assumed the pre-allocated IOBufOutputStream had a single contiguous range,
+// but StreamArena::newRange caps each range at the size of one allocator run,
+// which can be smaller than the requested size. seekp() then failed because
+// the target position exceeded the end of the first (and only) range.
+//
+// Reproducing condition: 16 columns × 10'000 rows × 50% nulls in one
+// partition generates enough output (~100 KB) to trigger the run-size cap.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    flushNullsBitmapManyColumnsLargeRowCount) {
+  constexpr int32_t kNumCols = 16;
+  constexpr int32_t kNumRows = 10'000;
+
+  std::vector<std::string> names;
+  std::vector<VectorPtr> children;
+  names.reserve(kNumCols);
+  children.reserve(kNumCols);
+
+  for (int col = 0; col < kNumCols; ++col) {
+    names.push_back(fmt::format("c{}", col));
+    // Rows where (row % 2 == 0) are null; the rest hold (row * kNumCols + col).
+    children.push_back(
+        makeFlatVector<int64_t>(
+            kNumRows,
+            [col](auto row) {
+              return static_cast<int64_t>(row * kNumCols + col);
+            },
+            [](auto row) { return (row % 2) == 0; }));
+  }
+
+  auto input = makeRowVector(names, children);
+  auto rowType = std::static_pointer_cast<const RowType>(input->type());
+
+  auto serializer = makeSerializer(rowType, 1);
+  serializer->append(input, std::vector<uint32_t>(kNumRows, 0));
+  auto ioBufs = serializer->flush();
+
+  ASSERT_EQ(ioBufs.size(), 1);
+
+  auto result = deserialize(*ioBufs.at(0).first, rowType);
+  ASSERT_EQ(result->size(), kNumRows);
+
+  for (int col = 0; col < kNumCols; ++col) {
+    auto* flat = result->childAt(col)->as<FlatVector<int64_t>>();
+    for (int row = 0; row < kNumRows; ++row) {
+      if ((row % 2) == 0) {
+        EXPECT_TRUE(result->childAt(col)->isNullAt(row))
+            << "col=" << col << " row=" << row;
+      } else {
+        ASSERT_FALSE(result->childAt(col)->isNullAt(row))
+            << "col=" << col << " row=" << row;
+        EXPECT_EQ(
+            flat->valueAt(row), static_cast<int64_t>(row * kNumCols + col))
+            << "col=" << col << " row=" << row;
+      }
+    }
+  }
+}

From c114147d54c993712e4a25e2cb6f2f5123661391 Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Wed, 1 Apr 2026 02:12:25 -0700
Subject: [PATCH 10/24] feat(PartitionedOutput): Add OptimizedPartitionedOutput
 operator

This commit introduces OptimizedPartitionedOutput, a PartitionedOutput
operator backed by PrestoIterativePartitioningSerializer. Enabled via query
config key "optimized_repartitioning" (default off). LocalPlanner
selects it over the standard PartitionedOutput when the flag is set.

TODO: replicateNullsAndAny is not yet supported and raises a user error.
---
 velox/core/QueryConfig.h                      |   8 +
 velox/exec/CMakeLists.txt                     |   1 +
 velox/exec/LocalPlanner.cpp                   |  13 +-
 velox/exec/OptimizedPartitionedOutput.cpp     | 202 +++++
 velox/exec/OptimizedPartitionedOutput.h       | 103 +++
 velox/exec/tests/CMakeLists.txt               |   1 +
 .../tests/OptimizedPartitionedOutputTest.cpp  | 787 ++++++++++++++++++
 7 files changed, 1112 insertions(+), 3 deletions(-)
 create mode 100644 velox/exec/OptimizedPartitionedOutput.cpp
 create mode 100644 velox/exec/OptimizedPartitionedOutput.h
 create mode 100644 velox/exec/tests/OptimizedPartitionedOutputTest.cpp

diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h
index 772015daa1f..0571284aedc 100644
--- a/velox/core/QueryConfig.h
+++ b/velox/core/QueryConfig.h
@@ -1469,6 +1469,14 @@ class QueryConfig {
       1000,
       "Batch size threshold for zero-copy in MarkSorted operator.")
 
+  VELOX_QUERY_CONFIG(
+      kOptimizedPartitionedOutputEnabled,
+      optimizedPartitionedOutputEnabled,
+      "optimized_repartitioning",
+      bool,
+      false,
+      "Enable OptimizedPartitionedOutput operator.")
+
   // --- Hand-written accessors for properties that need custom logic ---
 
   // Generated by VELOX_QUERY_CONFIG for simple properties above.
diff --git a/velox/exec/CMakeLists.txt b/velox/exec/CMakeLists.txt
index 3a5bec7e6e8..53b88bd28e3 100644
--- a/velox/exec/CMakeLists.txt
+++ b/velox/exec/CMakeLists.txt
@@ -71,6 +71,7 @@ velox_add_library(
   OperatorTraceScan.cpp
   OperatorTraceWriter.cpp
   OperatorUtils.cpp
+  OptimizedPartitionedOutput.cpp
   OrderBy.cpp
   OutputBuffer.cpp
   OutputBufferManager.cpp
diff --git a/velox/exec/LocalPlanner.cpp b/velox/exec/LocalPlanner.cpp
index 39f009fe39a..a46daa8b4f1 100644
--- a/velox/exec/LocalPlanner.cpp
+++ b/velox/exec/LocalPlanner.cpp
@@ -37,6 +37,7 @@
 #include "velox/exec/NestedLoopJoinBuild.h"
 #include "velox/exec/NestedLoopJoinProbe.h"
 #include "velox/exec/OperatorTraceScan.h"
+#include "velox/exec/OptimizedPartitionedOutput.h"
 #include "velox/exec/OrderBy.h"
 #include "velox/exec/ParallelProject.h"
 #include "velox/exec/PartitionedOutput.h"
@@ -553,9 +554,15 @@ std::shared_ptr<Driver> DriverFactory::createDriver(
         auto partitionedOutputNode =
             std::dynamic_pointer_cast<const core::PartitionedOutputNode>(
                 planNode)) {
-      operators.push_back(
-          std::make_unique<PartitionedOutput>(
-              id, ctx.get(), partitionedOutputNode, eagerFlush(*planNode)));
+      if (ctx->queryConfig().optimizedPartitionedOutputEnabled()) {
+        operators.push_back(
+            std::make_unique<OptimizedPartitionedOutput>(
+                id, ctx.get(), partitionedOutputNode));
+      } else {
+        operators.push_back(
+            std::make_unique<PartitionedOutput>(
+                id, ctx.get(), partitionedOutputNode, eagerFlush(*planNode)));
+      }
     } else if (
         auto joinNode =
             std::dynamic_pointer_cast<const core::HashJoinNode>(planNode)) {
diff --git a/velox/exec/OptimizedPartitionedOutput.cpp b/velox/exec/OptimizedPartitionedOutput.cpp
new file mode 100644
index 00000000000..bad3ea49378
--- /dev/null
+++ b/velox/exec/OptimizedPartitionedOutput.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/exec/OptimizedPartitionedOutput.h"
+
+#include "velox/exec/HashPartitionFunction.h"
+#include "velox/exec/SerializedPage.h"
+#include "velox/exec/Task.h"
+
+namespace facebook::velox::exec {
+
+OptimizedPartitionedOutput::OptimizedPartitionedOutput(
+    int32_t operatorId,
+    DriverCtx* ctx,
+    const std::shared_ptr<const core::PartitionedOutputNode>& planNode)
+    : Operator(
+          ctx,
+          planNode->outputType(),
+          operatorId,
+          planNode->id(),
+          "OptimizedPartitionedOutput"),
+      taskId_(operatorCtx_->taskId()),
+      inputType_(planNode->inputType()),
+      keyChannels_(toChannels(planNode->inputType(), planNode->keys())),
+      outputChannels_(calculateOutputChannels(
+          planNode->inputType(),
+          planNode->outputType(),
+          planNode->outputType())),
+      numDestinations_(planNode->numPartitions()),
+      replicateNullsAndAny_(planNode->isReplicateNullsAndAny()),
+      bufferManager_(OutputBufferManager::getInstanceRef()),
+      // NOTE: 'bufferReleaseFn_' is intentionally a no-op: it only captures a
+      // reference to the task so the task is not deleted while output buffers
+      // are still accessed outside the output buffer manager, e.g. while the
+      // Prestissimo http server holds them to send the data response.
+      bufferReleaseFn_([task = operatorCtx_->task()]() {}),
+      maxOutputBufferBytes_(ctx->task->queryCtx()
+                                ->queryConfig()
+                                .maxPartitionedOutputBufferSize()),
+      pool_(pool()),
+      partitionFunction_(
+          numDestinations_ == 1
+              ? nullptr
+              : planNode->partitionFunctionSpec().create(numDestinations_)) {
+  if (!planNode->isPartitioned()) {
+    VELOX_USER_CHECK_EQ(numDestinations_, 1);
+  }
+  if (numDestinations_ == 1) {
+    VELOX_USER_CHECK(keyChannels_.empty());
+  }
+
+  serializer::presto::SerdeOpts options;
+  options.compressionKind = common::stringToCompressionKind(
+      operatorCtx_->driverCtx()->queryConfig().shuffleCompressionKind());
+  options.minCompressionRatio = 0.8;
+
+  serializer_ = std::make_unique<
+      serializer::presto::PrestoIterativePartitioningSerializer>(
+      inputType_, numDestinations_, options, pool_);
+}
+
+void OptimizedPartitionedOutput::addInput(RowVectorPtr input) {
+  VELOX_USER_CHECK(
+      !replicateNullsAndAny_,
+      "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput");
+
+  // Drain the serializer first if taking this batch would reach the limit.
+  const auto projectedBytes =
+      serializer_->bytesBuffered() + input->retainedSize();
+  if (projectedBytes >= maxOutputBufferBytes_) {
+    flush();
+  }
+
+  // A single destination needs no partition function: every row maps to 0.
+  std::optional<uint32_t> commonPartition{0u};
+  partitions_.resize(input->size());
+  if (numDestinations_ != 1) {
+    commonPartition = partitionFunction_->partition(*input, partitions_);
+  }
+  if (commonPartition.has_value()) {
+    // The whole batch goes to one partition; fill the per-row buffer.
+    std::fill(partitions_.begin(), partitions_.end(), *commonPartition);
+  }
+
+  serializer_->append(input, partitions_);
+
+  // Count the append both in the member counter and as a runtime stat.
+  auto statsGuard = stats_.wlock();
+  ++numAppends_;
+  statsGuard->addRuntimeStat("numAppends", RuntimeCounter(1));
+}
+
+bool OptimizedPartitionedOutput::needsInput() const {
+  return blockingReason_ == BlockingReason::kNotBlocked;
+}
+
+RowVectorPtr OptimizedPartitionedOutput::getOutput() {
+  if (finished_) {
+    return nullptr;
+  }
+
+  blockingReason_ = BlockingReason::kNotBlocked;
+  if (noMoreInput_ || serializer_->bytesBuffered() >= maxOutputBufferBytes_) {
+    flush();
+  }
+
+  // If flush() blocked, stop here without advancing operator state, even if
+  // noMoreInput_ is set; the driver calls getOutput() again once unblocked.
+  if (blockingReason_ != BlockingReason::kNotBlocked) {
+    return nullptr;
+  }
+
+  if (noMoreInput_ && serializer_->bytesBuffered() == 0) {
+    // TODO: merge serializer runtime stats into operator stats once
+    // PrestoIterativePartitioningSerializer exposes runtimeStats().
+    auto bufferManager = bufferManager_.lock();
+    VELOX_CHECK_NOT_NULL(
+        bufferManager, "OutputBufferManager was already destructed");
+    bufferManager->noMoreData(operatorCtx_->task()->taskId());
+    finished_ = true;
+  }
+  return nullptr;
+}
+
+BlockingReason OptimizedPartitionedOutput::isBlocked(ContinueFuture* future) {
+  if (blockingReason_ == BlockingReason::kNotBlocked) {
+    return BlockingReason::kNotBlocked;
+  }
+  *future = std::move(future_);
+  blockingReason_ = BlockingReason::kNotBlocked;
+  return BlockingReason::kWaitForConsumer;
+}
+
+bool OptimizedPartitionedOutput::isFinished() {
+  return finished_;
+}
+
+void OptimizedPartitionedOutput::flush() {
+  const auto flushedBytes = serializer_->bytesBuffered();
+  const auto flushedRows = serializer_->rowsBuffered();
+
+  // Serializes all destinations and resets serializer_->bytesBuffered() to 0.
+  auto serializedIOBufs = serializer_->flush();
+  auto bufferManager = bufferManager_.lock();
+  VELOX_CHECK_NOT_NULL(
+      bufferManager, "OutputBufferManager was already destructed");
+
+  bool shouldBlock = false;
+  ContinueFuture future = ContinueFuture::makeEmpty();
+  for (auto& [destination, pageData] : serializedIOBufs) {
+    // We will only pass the future to bufferManager->enqueue() for the first
+    // blocked destination. This is to avoid unnecessary creation of
+    // ContinueFuture objects for the remaining destinations.
+    ContinueFuture* futurePtr = shouldBlock ? nullptr : &future;
+
+    // Enqueue the data for each non-empty partition. Since the pageData is
+    // already serialized, enqueueing them would not cause new memory
+    // allocations. This will always move the pageData to the OutputBuffers no
+    // matter if the OutputBuffer is blocked.
+    bool blocked = bufferManager->enqueue(
+        taskId_,
+        static_cast<int>(destination),
+        std::make_unique<PrestoSerializedPage>(
+            std::move(pageData.first),
+            [fn = bufferReleaseFn_](folly::IOBuf&) { fn(); },
+            pageData.second),
+        futurePtr);
+
+    if (blocked && !shouldBlock) {
+      blockingReason_ = BlockingReason::kWaitForConsumer;
+      shouldBlock = true;
+      future_ = std::move(future);
+    }
+  }
+
+  auto lockedStats = stats_.wlock();
+  if (flushedRows > 0) {
+    // Only a non-empty flush counts as an output vector and as a flush.
+    lockedStats->addOutputVector(flushedBytes, flushedRows);
+    ++numFlushes_;
+    lockedStats->addRuntimeStat("numFlushes", RuntimeCounter(1));
+  }
+  if (shouldBlock) {
+    ++numBlockedTimes_;
+    lockedStats->addRuntimeStat("numBlockedTimes", RuntimeCounter(1));
+  }
+}
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedPartitionedOutput.h b/velox/exec/OptimizedPartitionedOutput.h
new file mode 100644
index 00000000000..0f9dd2e2b47
--- /dev/null
+++ b/velox/exec/OptimizedPartitionedOutput.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/exec/Operator.h"
+#include "velox/exec/OutputBufferManager.h"
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+
+namespace facebook::velox::exec {
+
+/// Partitioned output operator backed by PrestoIterativePartitioningSerializer.
+///
+/// Routes each input row to a partition via a hash function, buffers the
+/// partitioned data, and flushes serialized Presto pages into the output
+/// buffer manager when the buffer is full or the pipeline is draining.
+class OptimizedPartitionedOutput : public Operator {
+ public:
+  /// Intended minimum flush size for non-final flushes (60 KB + overhead fits
+  /// a 64 KB network MTU). NOTE(review): flush() does not consult it yet.
+  static constexpr uint64_t kMinDestinationSize = 60 * 1024;
+
+  OptimizedPartitionedOutput(
+      int32_t operatorId,
+      DriverCtx* ctx,
+      const std::shared_ptr<const core::PartitionedOutputNode>& planNode);
+
+  void addInput(RowVectorPtr input) override;
+
+  /// Returns true when the operator is not waiting for the output buffer to
+  /// drain. The driver checks this before calling addInput() so a blocked
+  /// state does not accumulate additional rows.
+  bool needsInput() const override;
+
+  /// Always returns nullptr; output is pushed into the buffer manager as a
+  /// side-effect. Flushes the serializer when the buffer is full or the
+  /// pipeline is draining, then signals noMoreData() once all rows are sent.
+  RowVectorPtr getOutput() override;
+
+  BlockingReason isBlocked(ContinueFuture* future) override;
+
+  bool isFinished() override;
+
+ private:
+  /// Serializes all buffered rows into Presto pages and enqueues each page
+  /// into the output buffer manager. All destinations are always enqueued;
+  /// sets blockingReason_ and records a future if the output buffer is full.
+  /// Increments numFlushes_ only when at least one row was flushed.
+  void flush();
+
+  const std::string taskId_;
+  /// Input row type; also used as output type (column reordering not yet
+  /// applied).
+  const RowTypePtr inputType_;
+  const std::vector<column_index_t> keyChannels_;
+  /// Non-empty when the output column order differs from the input.
+  const std::vector<column_index_t> outputChannels_;
+  const int32_t numDestinations_;
+
+  const bool replicateNullsAndAny_;
+  const std::weak_ptr<exec::OutputBufferManager> bufferManager_;
+  /// Holds a reference to the owning task to prevent it from being destroyed
+  /// while serialized pages are in flight inside the buffer manager.
+  const std::function<void()> bufferReleaseFn_;
+  const int64_t maxOutputBufferBytes_;
+
+  velox::memory::MemoryPool* pool_;
+  /// Computes per-row partition assignments. Null when numDestinations_ == 1.
+  std::unique_ptr<core::PartitionFunction> partitionFunction_;
+  /// Reusable buffer for per-row partition assignments.
+  std::vector<uint32_t> partitions_;
+  std::unique_ptr<serializer::presto::PrestoIterativePartitioningSerializer>
+      serializer_;
+
+  BlockingReason blockingReason_{BlockingReason::kNotBlocked};
+  ContinueFuture future_;
+  bool finished_{false};
+
+  /// Counts addInput() calls; each call appends its input to the serializer.
+  /// Exposed as the "numAppends" runtime stat.
+  uint64_t numAppends_{0};
+  /// Counts non-empty flush() calls, i.e. flushes that serialized at least one
+  /// row. Exposed as the "numFlushes" runtime stat for test verification.
+  uint64_t numFlushes_{0};
+  /// Counts flush() calls that caused the driver to block on a full output
+  /// buffer. Exposed as the "numBlockedTimes" runtime stat.
+  uint64_t numBlockedTimes_{0};
+};
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt
index a97d63ccd5b..58b480478ef 100644
--- a/velox/exec/tests/CMakeLists.txt
+++ b/velox/exec/tests/CMakeLists.txt
@@ -148,6 +148,7 @@ set(
   AssignUniqueIdTest.cpp
   FilterProjectTest.cpp
   AsyncConnectorTest.cpp
+  OptimizedPartitionedOutputTest.cpp
 )
 
 set(
diff --git a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
new file mode 100644
index 00000000000..86475cd0f41
--- /dev/null
+++ b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <future>
+#include <random>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "velox/common/memory/ByteStream.h"
+#include "velox/exec/HashPartitionFunction.h"
+#include "velox/exec/OptimizedPartitionedOutput.h"
+#include "velox/exec/Task.h"
+#include "velox/exec/tests/utils/OperatorTestBase.h"
+#include "velox/exec/tests/utils/PlanBuilder.h"
+#include "velox/exec/tests/utils/QueryAssertions.h"
+#include "velox/serializers/PrestoSerializer.h"
+
+namespace facebook::velox::exec::test {
+
+/// How null values are distributed in value columns.
+enum class NullMode {
+  kNoNull, // no null values
+  kPartialNull, // row i is null if i % 2 == 0
+  kAllNull, // all values are null
+};
+
+/// Describes one parameterized test configuration.
+struct TestParam {
+  /// Short lowercase name used as the gtest parameter suffix.
+  std::string name;
+  /// Element type for value columns. Ignored when numValueCols == 0.
+  TypePtr valueType;
+  /// Number of partition-key columns (all INTEGER).
+  int numPartitionCols;
+  /// Number of value columns of valueType.
+  int numValueCols;
+  /// Null pattern applied to value columns.
+  NullMode nullMode;
+};
+
+/// Returns the full set of TestParam combinations:
+///   - numValueCols==0: 1 entry per numPartitionCols (type/nullMode irrelevant)
+///   - numValueCols∈{1,256}: all 4 types × 2 pk counts × 3 null modes
+std::vector<TestParam> testParams() {
+  std::vector<TestParam> params;
+
+  const std::vector<std::pair<std::string, TypePtr>> types = {
+      {"bool", BOOLEAN()},
+      {"tinyint", TINYINT()},
+      {"bigint", BIGINT()},
+      {"hugeint", HUGEINT()},
+  };
+
+  const std::vector<std::pair<std::string, NullMode>> nullModes = {
+      {"no_null", NullMode::kNoNull},
+      {"partial_null", NullMode::kPartialNull},
+      {"all_null", NullMode::kAllNull},
+  };
+
+  // Zero value columns: type and null mode do not affect test behavior.
+  for (int numPk : {1, 4}) {
+    params.push_back({
+        .name = "pk" + std::to_string(numPk) + "_val0",
+        .valueType = BIGINT(),
+        .numPartitionCols = numPk,
+        .numValueCols = 0,
+        .nullMode = NullMode::kNoNull,
+    });
+  }
+
+  // One and many value columns: all type × pk-count × null-mode combinations.
+  for (int numVal : {1, 256}) {
+    for (const auto& [typeName, type] : types) {
+      for (int numPk : {1, 4}) {
+        for (const auto& [nullName, nullMode] : nullModes) {
+          params.push_back({
+              .name = "pk" + std::to_string(numPk) + "_val" +
+                  std::to_string(numVal) + "_" + typeName + "_" + nullName,
+              .valueType = type,
+              .numPartitionCols = numPk,
+              .numValueCols = numVal,
+              .nullMode = nullMode,
+          });
+        }
+      }
+    }
+  }
+
+  return params;
+}
+
+/// Collected output from a single run of runPartitionedOutput().
+struct PartitionedOutputResult {
+  // Declared first so it is destroyed last: the IOBufs in pages reference the
+  // task's memory pool, so the task must outlive all the pages.
+  std::shared_ptr<Task> task;
+  /// Serialized output pages per partition, indexed by partition ID.
+  std::vector<std::vector<std::unique_ptr<folly::IOBuf>>> pages;
+  /// Number of pages received by each partition.
+  std::vector<size_t> pageCounts;
+  /// Total rows deserialized from each partition's pages.
+  std::vector<int64_t> rowCounts;
+  /// Number of partitions that received at least one page.
+  int numNonEmptyPartitions{0};
+  /// Sum of operator's numAppends runtime stat.
+  int64_t numAppends{0};
+  /// Sum of operator's numFlushes runtime stat.
+  int64_t numFlushes{0};
+  /// Sum of operator's numBlockedTimes runtime stat.
+  int64_t numBlockedTimes{0};
+};
+
+/// Shared infrastructure for all OptimizedPartitionedOutput tests.
+class OptimizedPartitionedOutputTest : public OperatorTestBase {
+ protected:
+  std::shared_ptr<core::QueryCtx> createQueryContext(
+      std::unordered_map<std::string, std::string> config) {
+    config[core::QueryConfig::kOptimizedPartitionedOutputEnabled] = "true";
+    return core::QueryCtx::create(
+        executor_.get(), core::QueryConfig(std::move(config)));
+  }
+
+  /// Fetches one batch of serialized pages from the output buffer for the given
+  /// destination. Returns the pages via a promise/future callback.
+  std::vector<std::unique_ptr<folly::IOBuf>>
+  getData(const std::string& taskId, int destination, int64_t sequence) {
+    auto [promise, semiFuture] = folly::makePromiseContract<
+        std::vector<std::unique_ptr<folly::IOBuf>>>();
+    VELOX_CHECK(bufferManager_->getData(
+        taskId,
+        destination,
+        OptimizedPartitionedOutput::kMinDestinationSize,
+        sequence,
+        [result = std::make_shared<
+             folly::Promise<std::vector<std::unique_ptr<folly::IOBuf>>>>(
+             std::move(promise))](
+            std::vector<std::unique_ptr<folly::IOBuf>> pages,
+            int64_t /*sequence*/,
+            std::vector<int64_t> /*remainingBytes*/) {
+          result->setValue(std::move(pages));
+        }));
+    auto future = std::move(semiFuture).via(executor_.get());
+    future.wait(std::chrono::seconds{10});
+    VELOX_CHECK(future.isReady());
+    return std::move(future).value();
+  }
+
+  /// Drains all pages for a destination until the null sentinel is received.
+  std::vector<std::unique_ptr<folly::IOBuf>> getAllData(
+      const std::string& taskId,
+      int destination) {
+    std::vector<std::unique_ptr<folly::IOBuf>> result;
+    int attempts = 0;
+    bool done = false;
+    while (!done) {
+      VELOX_CHECK_LT(++attempts, 10'000);
+      auto pages = getData(taskId, destination, result.size());
+      for (auto& page : pages) {
+        if (page) {
+          result.push_back(std::move(page));
+        } else {
+          bufferManager_->deleteResults(taskId, destination);
+          done = true;
+          break;
+        }
+      }
+    }
+    return result;
+  }
+
+  /// Deserializes a single Presto-serialized IOBuf page into a RowVector.
+  RowVectorPtr deserializePage(
+      const folly::IOBuf* iobuf,
+      const RowTypePtr& rowType) {
+    auto byteRanges = byteRangesFromIOBuf(const_cast<folly::IOBuf*>(iobuf));
+    auto byteStream =
+        std::make_unique<BufferInputStream>(std::move(byteRanges));
+    serializer::presto::PrestoVectorSerde serde;
+    RowVectorPtr result;
+    serde.deserialize(byteStream.get(), pool(), rowType, &result, 0, nullptr);
+    return result;
+  }
+
+  /// Deserializes and concatenates all pages for one partition into a single
+  /// RowVector. Returns an empty RowVector when pages is empty.
+  RowVectorPtr concatPages(
+      const std::vector<std::unique_ptr<folly::IOBuf>>& pages,
+      const RowTypePtr& rowType) {
+    RowVectorPtr result;
+    for (const auto& iobuf : pages) {
+      auto page = deserializePage(iobuf.get(), rowType);
+      if (!result) {
+        result = page;
+      } else {
+        result->append(page.get());
+      }
+    }
+    if (!result) {
+      result = std::static_pointer_cast<RowVector>(
+          BaseVector::create(rowType, 0, pool()));
+    }
+    return result;
+  }
+
+  int64_t getIntRuntimeStat(Task* task, const std::string& statName) {
+    const auto taskStats = task->taskStats();
+    const auto& runtimeStats =
+        taskStats.pipelineStats[0].operatorStats.back().runtimeStats;
+    auto it = runtimeStats.find(statName);
+    return it != runtimeStats.end() ? it->second.sum : 0;
+  }
+
+  /// Builds a plan from inputBatches, creates and starts a task, drains all
+  /// numPartitions destinations concurrently, waits for task completion, and
+  /// returns the collected pages, per-partition row counts, and operator
+  /// runtime stats. extraConfig is merged into the query config on top of the
+  /// OptimizedPartitionedOutput enable flag.
+  PartitionedOutputResult runPartitionedOutput(
+      const std::string& taskId,
+      const std::vector<RowVectorPtr>& inputBatches,
+      const std::vector<std::string>& partitionKeys,
+      int numPartitions,
+      std::unordered_map<std::string, std::string> extraConfig = {},
+      std::chrono::seconds timeout = std::chrono::seconds{30}) {
+    VELOX_CHECK(!inputBatches.empty());
+    const auto rowType =
+        std::dynamic_pointer_cast<const RowType>(inputBatches[0]->type());
+
+    auto plan = PlanBuilder()
+                    .values(inputBatches)
+                    .partitionedOutput(partitionKeys, numPartitions)
+                    .planNode();
+
+    auto task = Task::create(
+        taskId,
+        core::PlanFragment{plan},
+        0,
+        createQueryContext(std::move(extraConfig)),
+        Task::ExecutionMode::kParallel);
+    task->start(1);
+
+    // Drain all partitions concurrently to avoid deadlock with the driver.
+    std::vector<std::future<std::vector<std::unique_ptr<folly::IOBuf>>>>
+        futures;
+    futures.reserve(numPartitions);
+    for (int p = 0; p < numPartitions; ++p) {
+      futures.push_back(std::async(std::launch::async, [&, p] {
+        return getAllData(taskId, p);
+      }));
+    }
+
+    const auto taskWaitUs =
+        std::chrono::duration_cast<std::chrono::microseconds>(timeout).count();
+    EXPECT_TRUE(waitForTaskCompletion(task.get(), taskWaitUs));
+
+    PartitionedOutputResult result;
+    result.pages.resize(numPartitions);
+    result.pageCounts.resize(numPartitions, 0);
+    result.rowCounts.resize(numPartitions, 0);
+
+    for (int p = 0; p < numPartitions; ++p) {
+      result.pages[p] = futures[p].get();
+      result.pageCounts[p] = result.pages[p].size();
+      if (result.pageCounts[p] > 0) {
+        ++result.numNonEmptyPartitions;
+      }
+      result.rowCounts[p] = concatPages(result.pages[p], rowType)->size();
+    }
+
+    result.numAppends = getIntRuntimeStat(task.get(), "numAppends");
+    result.numFlushes = getIntRuntimeStat(task.get(), "numFlushes");
+    result.numBlockedTimes = getIntRuntimeStat(task.get(), "numBlockedTimes");
+    result.task = task;
+
+    return result;
+  }
+
+ private:
+  const std::shared_ptr<OutputBufferManager> bufferManager_{
+      OutputBufferManager::getInstanceRef()};
+};
+
+// ─── Parameterized fixture ───────────────────────────────────────────────────
+
+/// Parameterized fixture that exercises every TestParam combination.
+class OptimizedPartitionedOutputParamTest
+    : public OptimizedPartitionedOutputTest,
+      public ::testing::WithParamInterface<TestParam> {
+ protected:
+  const TestParam& param() const {
+    return GetParam();
+  }
+
+  /// Names for pk columns: ["p1"] or ["p1","p2","p3","p4"].
+  std::vector<std::string> pkColNames() const {
+    std::vector<std::string> names;
+    for (int i = 0; i < param().numPartitionCols; ++i) {
+      names.push_back("p" + std::to_string(i + 1));
+    }
+    return names;
+  }
+
+  /// Names for value columns: ["v0", ..., "v{N-1}"].
+  std::vector<std::string> valueColNames() const {
+    std::vector<std::string> names;
+    for (int i = 0; i < param().numValueCols; ++i) {
+      names.push_back("v" + std::to_string(i));
+    }
+    return names;
+  }
+
+  /// Full input ROW type: pk cols (INTEGER) followed by value cols.
+  RowTypePtr inputType() const {
+    std::vector<std::string> names = pkColNames();
+    std::vector<TypePtr> types(param().numPartitionCols, INTEGER());
+    for (const auto& name : valueColNames()) {
+      names.push_back(name);
+      types.push_back(param().valueType);
+    }
+    return ROW(std::move(names), std::move(types));
+  }
+
+  /// Channel indices of the pk columns within the input type.
+  std::vector<column_index_t> pkChannels() const {
+    std::vector<column_index_t> channels(param().numPartitionCols);
+    std::iota(channels.begin(), channels.end(), 0);
+    return channels;
+  }
+
+  /// Returns true if row i should be null in value columns for the current
+  /// null mode.
+  bool isNull(int rowIdx) const {
+    switch (param().nullMode) {
+      case NullMode::kNoNull:
+        return false;
+      case NullMode::kAllNull:
+        return true;
+      case NullMode::kPartialNull:
+        return rowIdx % 2 == 0;
+    }
+    VELOX_UNREACHABLE();
+  }
+
+  /// Creates a flat vector of the param's value type with random values and
+  /// nulls applied according to nullMode.
+  VectorPtr makeRandomValueVector(int numRows, std::mt19937_64& rng) {
+    auto isNullFn = [this](vector_size_t i) -> bool { return isNull(i); };
+
+    switch (param().valueType->kind()) {
+      case TypeKind::BOOLEAN:
+        return vectorMaker_.flatVector<bool>(
+            numRows,
+            [&](auto /*i*/) -> bool { return rng() % 2 == 0; },
+            isNullFn);
+      case TypeKind::TINYINT:
+        return vectorMaker_.flatVector<int8_t>(
+            numRows,
+            [&](auto /*i*/) -> int8_t { return static_cast<int8_t>(rng()); },
+            isNullFn);
+      case TypeKind::BIGINT:
+        return vectorMaker_.flatVector<int64_t>(
+            numRows,
+            [&](auto /*i*/) -> int64_t { return static_cast<int64_t>(rng()); },
+            isNullFn);
+      case TypeKind::HUGEINT:
+        return vectorMaker_.flatVector<int128_t>(
+            numRows,
+            [&](auto /*i*/) -> int128_t {
+              int64_t hi = static_cast<int64_t>(rng());
+              uint64_t lo = rng();
+              return (static_cast<int128_t>(hi) << 64) |
+                  static_cast<int128_t>(lo);
+            },
+            isNullFn);
+      default:
+        VELOX_UNREACHABLE(
+            "Unsupported value type: {}", param().valueType->toString());
+    }
+  }
+
+  /// Builds one input RowVector. p0Values holds the first pk column; each
+  /// subsequent pk column i is p0 + i. Value columns are filled with
+  /// independent random data drawn from rng.
+  RowVectorPtr makeInputBatch(
+      const std::vector<int32_t>& p0Values,
+      std::mt19937_64& rng) {
+    const int numRows = p0Values.size();
+    std::vector<std::string> names;
+    std::vector<VectorPtr> vecs;
+
+    // pk columns
+    for (int k = 0; k < param().numPartitionCols; ++k) {
+      names.push_back("p" + std::to_string(k + 1));
+      vecs.push_back(vectorMaker_.flatVector<int32_t>(
+          numRows, [&, k](auto i) { return p0Values[i] + k; }));
+    }
+
+    // value columns
+    for (int v = 0; v < param().numValueCols; ++v) {
+      names.push_back("v" + std::to_string(v));
+      vecs.push_back(makeRandomValueVector(numRows, rng));
+    }
+
+    return makeRowVector(names, vecs);
+  }
+
+  /// Sorts a vector by value for order-independent comparison. Returns a
+  /// dictionary vector with rows sorted in ascending order.
+  VectorPtr canonicalize(const VectorPtr& vector) {
+    const auto numRows = vector->size();
+    auto indices = makeIndices(numRows, [](auto i) { return i; });
+    auto* data = indices->asMutable<vector_size_t>();
+    std::stable_sort(data, data + numRows, [&](auto a, auto b) {
+      return vector->compare(vector.get(), a, b) < 0;
+    });
+    return BaseVector::wrapInDictionary(nullptr, indices, numRows, vector);
+  }
+
+  /// Builds a RowVector by gathering rows from inputBatches at the given
+  /// (batchIdx, rowIdx) positions. Used to construct the per-partition expected
+  /// RowVector.
+  RowVectorPtr gatherRows(
+      const std::vector<RowVectorPtr>& batches,
+      const std::vector<std::pair<int, int>>& rowList,
+      const RowTypePtr& rowType) {
+    const auto numRows = static_cast<vector_size_t>(rowList.size());
+    auto result = std::static_pointer_cast<RowVector>(
+        BaseVector::create(rowType, numRows, pool()));
+    for (vector_size_t r = 0; r < numRows; ++r) {
+      result->copy(batches[rowList[r].first].get(), r, rowList[r].second, 1);
+    }
+    return result;
+  }
+
+  /// Verifies that the deserialized pages for each partition exactly match the
+  /// rows from inputBatches that were routed to that partition. Both expected
+  /// and actual rows are sorted (canonicalized) before comparison to allow
+  /// order-independent matching.
+  void verifyDataIntegrity(
+      const std::vector<RowVectorPtr>& inputBatches,
+      const std::vector<std::vector<std::unique_ptr<folly::IOBuf>>>& allPages,
+      int numPartitions) {
+    // Compute expected per-partition row list using the same hash function as
+    // the operator.
+    auto partitionFn = std::make_unique<HashPartitionFunction>(
+        false, numPartitions, inputType(), pkChannels());
+
+    std::vector<std::vector<std::pair<int, int>>> expectedRows(numPartitions);
+    for (int batchIdx = 0; batchIdx < static_cast<int>(inputBatches.size());
+         ++batchIdx) {
+      std::vector<uint32_t> assignments(inputBatches[batchIdx]->size());
+      partitionFn->partition(*inputBatches[batchIdx], assignments);
+      for (int rowIdx = 0; rowIdx < static_cast<int>(assignments.size());
+           ++rowIdx) {
+        expectedRows[assignments[rowIdx]].emplace_back(batchIdx, rowIdx);
+      }
+    }
+
+    const auto rowType = inputType();
+    int64_t totalRows = 0;
+
+    for (int p = 0; p < numPartitions; ++p) {
+      auto expected = gatherRows(inputBatches, expectedRows[p], rowType);
+      auto actual = concatPages(allPages[p], rowType);
+
+      totalRows += expected->size();
+      ASSERT_EQ(expected->size(), actual->size())
+          << "partition " << p << " row count mismatch";
+
+      // Sort both vectors before comparing to allow order-independent matching.
+      auto expectedSorted = canonicalize(expected);
+      auto actualSorted = canonicalize(actual);
+      velox::test::assertEqualVectors(expectedSorted, actualSorted);
+    }
+
+    int64_t sentRows = 0;
+    for (const auto& batch : inputBatches) {
+      sentRows += batch->size();
+    }
+    EXPECT_EQ(totalRows, sentRows);
+  }
+};
+
+// ─── singleFlush ─────────────────────────────────────────────────────────────
+
+// Sends one batch into a large-buffer operator. All data is buffered without
+// triggering an intermediate flush; the final noMoreInput flush serializes
+// everything once. Verifies numFlushes==1, numBlockedTimes==0, and that every
+// deserialized row matches its source.
+TEST_P(OptimizedPartitionedOutputParamTest, singleFlush) {
+  constexpr int kNumPartitions = 4;
+  // One row per partition key, so every partition gets data.
+  std::vector<int32_t> p0Values;
+  for (int i = 0; i < kNumPartitions; ++i) {
+    p0Values.push_back(i);
+  }
+
+  std::mt19937_64 rng(42);
+  const std::vector<RowVectorPtr> inputBatches = {
+      makeInputBatch(p0Values, rng)};
+
+  auto result = runPartitionedOutput(
+      "local://test-single-flush-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+  EXPECT_EQ(result.numAppends, 1);
+  EXPECT_EQ(result.numFlushes, 1);
+  EXPECT_EQ(result.numBlockedTimes, 0);
+}
+
+// ─── multipleFlushes ─────────────────────────────────────────────────────────
+
+// Sends multiple batches through a 1-byte serializer ceiling so each addInput
+// triggers its own flush. A 10-byte OutputBuffer ceiling forces blocking.
+// Concurrent consumers drain each partition so the driver can unblock.
+// Verifies numFlushes==kBatches, numBlockedTimes==kBatches, and data integrity.
+TEST_P(OptimizedPartitionedOutputParamTest, multipleFlushes) {
+  constexpr int kNumPartitions = 4;
+  constexpr int kBatches = 10;
+
+  // For wide schemas, reduce rows per batch so each batch stays small.
+  const int kRowsPerBatch = param().numValueCols >= 64 ? 2 : kNumPartitions;
+
+  std::vector<int32_t> p0Values(kRowsPerBatch);
+  for (int i = 0; i < kRowsPerBatch; ++i) {
+    p0Values[i] = i % kNumPartitions;
+  }
+  std::mt19937_64 rng(42);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-multiple-flushes-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions,
+      // 1-byte serializer ceiling flushes before every addInput.
+      // 10-byte OutputBuffer ceiling forces blocking on every enqueue.
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize, "1"},
+       {core::QueryConfig::kMaxOutputBufferSize, "10"}},
+      std::chrono::seconds{30});
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+  EXPECT_EQ(result.numAppends, kBatches);
+  EXPECT_EQ(result.numFlushes, kBatches);
+  EXPECT_EQ(result.numBlockedTimes, kBatches);
+}
+
+// ─── uniformDistribution ─────────────────────────────────────────────────────
+
+// Sends many batches whose partition key values are drawn uniformly at random
+// from [0, 999] so that, with overwhelming probability, every partition
+// receives rows. Uses the default buffer size (no intermediate flush).
+// Verifies that all partitions are non-empty and that data integrity holds.
+TEST_P(OptimizedPartitionedOutputParamTest, uniformDistribution) {
+  constexpr int kNumPartitions = 4;
+  constexpr int kBatches = 10;
+
+  std::mt19937_64 rng(123);
+  // Use enough distinct p1 values across a wide range so all partitions receive
+  // rows regardless of how the hash distributes them. With 50 distinct p1
+  // values and 4 partitions the probability of any partition being empty is <
+  // 1e-6.
+  constexpr int kRowsPerBatch = 50;
+  std::uniform_int_distribution<int32_t> dist(0, 999);
+
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    std::vector<int32_t> p0Values(kRowsPerBatch);
+    for (auto& v : p0Values) {
+      v = dist(rng);
+    }
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-uniform-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // With 50 distinct p1 values per batch and 4 partitions, every partition must
+  // receive rows (probability of any bucket being empty is < 1e-6).
+  EXPECT_EQ(result.numNonEmptyPartitions, kNumPartitions);
+}
+
+// ─── skewed distributions
+// ──────────────────────────────────────────────────────
+
+// Sends batches with 6 distinct key values whose frequencies halve at each
+// step (32, 16, 8, 4, 2, 1), so non-empty partitions end up with very
+// different row counts. Because 6 < 8, at least two partitions stay empty.
+// This sits between uniformDistribution (all partitions filled) and
+// twoDestinations (at most 2 of 64 filled).
+TEST_P(OptimizedPartitionedOutputParamTest, moderateSkew) {
+  constexpr int kNumPartitions = 8;
+  constexpr int kBatches = 5;
+
+  // Key i appears 2^(5-i) times per batch: key 0 → 32 rows, key 1 → 16,
+  // key 2 → 8, key 3 → 4, key 4 → 2, key 5 → 1. Total: 63 rows per batch.
+  std::vector<int32_t> keyPattern;
+  for (int key = 0; key < 6; ++key) {
+    const int count = 1 << (5 - key); // 32, 16, 8, 4, 2, 1
+    for (int j = 0; j < count; ++j) {
+      keyPattern.push_back(key);
+    }
+  }
+
+  std::mt19937_64 rng(55);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    auto p0Values = keyPattern;
+    std::shuffle(p0Values.begin(), p0Values.end(), rng);
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-moderate-skew-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // 6 distinct keys → at most 6 non-empty partitions; 6 < 8 → at least one
+  // empty partition.
+  EXPECT_LE(result.numNonEmptyPartitions, 6);
+
+  // Verify a wide spread in per-partition row counts: the heaviest non-empty
+  // partition must have at least 2x the average non-empty partition size.
+  // This remains stable even when several low-frequency keys hash to the same
+  // bucket, unlike a comparison against the minimum non-empty partition.
+  int64_t maxRows = 0;
+  int64_t totalNonZeroRows = 0;
+  int64_t numNonZeroPartitions = 0;
+  for (int p = 0; p < kNumPartitions; ++p) {
+    if (result.rowCounts[p] > 0) {
+      maxRows = std::max(maxRows, result.rowCounts[p]);
+      totalNonZeroRows += result.rowCounts[p];
+      ++numNonZeroPartitions;
+    }
+  }
+  ASSERT_GT(numNonZeroPartitions, 0);
+  EXPECT_GE(maxRows * numNonZeroPartitions, totalNonZeroRows * 2);
+}
+
+// Sends many batches with p1 restricted to {0, 1} into a 64-partition
+// operator. At most 2 of the 64 partitions will receive any rows; the rest
+// must be empty. Verifies data integrity and the empty-partition invariant.
+TEST_P(OptimizedPartitionedOutputParamTest, twoDestinations) {
+  constexpr int kNumPartitions = 64;
+  constexpr int kBatches = 10;
+  constexpr int kRowsPerBatch = 4;
+
+  std::mt19937_64 rng(7);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    // p1 only takes values 0 and 1; at most 2 of 64 partitions receive rows.
+    std::vector<int32_t> p0Values(kRowsPerBatch);
+    for (int i = 0; i < kRowsPerBatch; ++i) {
+      p0Values[i] = i % 2;
+    }
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-skewed-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // p1 ∈ {0, 1}: at most 2 distinct hash buckets receive rows.
+  EXPECT_LE(result.numNonEmptyPartitions, 2);
+  EXPECT_GE(result.numNonEmptyPartitions, 1);
+}
+
+// Sends multiple batches where every row carries the same partition key value
+// so all rows hash to a single destination. Verifies that exactly one partition
+// receives all rows and the remaining partitions stay empty.
+TEST_P(OptimizedPartitionedOutputParamTest, singleDestination) {
+  constexpr int kNumPartitions = 8;
+  constexpr int kBatches = 5;
+  constexpr int kRowsPerBatch = 10;
+
+  // Every row has p1=0 (p2=1, p3=2, p4=3 for multi-pk params), so the hash is
+  // identical for every row and all rows land in one partition.
+  std::mt19937_64 rng(99);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    inputBatches.push_back(
+        makeInputBatch(std::vector<int32_t>(kRowsPerBatch, 0), rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-single-dest-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // All rows must land in exactly one partition.
+  EXPECT_EQ(result.numNonEmptyPartitions, 1);
+
+  // That one partition must hold every row from every batch.
+  const int64_t totalInputRows = static_cast<int64_t>(kBatches) * kRowsPerBatch;
+  for (int p = 0; p < kNumPartitions; ++p) {
+    if (result.rowCounts[p] > 0) {
+      EXPECT_EQ(result.rowCounts[p], totalInputRows) << "partition " << p;
+    }
+  }
+}
+
+// ─── instantiation ───────────────────────────────────────────────────────────
+
+INSTANTIATE_TEST_SUITE_P(
+    Params,
+    OptimizedPartitionedOutputParamTest,
+    ::testing::ValuesIn(testParams()),
+    [](const ::testing::TestParamInfo<TestParam>& info) {
+      return info.param.name;
+    });
+
+// ─── non-parameterized tests ─────────────────────────────────────────────────
+
+// Verifies that replicateNullsAndAny raises an error since it is not yet
+// supported by OptimizedPartitionedOutput.
+TEST_F(OptimizedPartitionedOutputTest, replicateNullsAndAnyUnsupported) {
+  auto input = makeRowVector(
+      {"p1", "v1"},
+      {makeNullableFlatVector<int32_t>({0, std::nullopt, 1}),
+       makeFlatVector<std::string>({"a", "b", "c"})});
+
+  auto plan =
+      PlanBuilder()
+          .values({input})
+          .partitionedOutput({"p1"}, 2, /*replicateNullsAndAny=*/true, {"v1"})
+          .planNode();
+
+  auto taskId = "local://test-replicate-nulls-unsupported-0";
+  auto task = Task::create(
+      taskId,
+      core::PlanFragment{plan},
+      0,
+      createQueryContext({}),
+      Task::ExecutionMode::kParallel);
+  task->start(1);
+
+  const auto taskWaitUs = std::chrono::duration_cast<std::chrono::microseconds>(
+                              std::chrono::seconds{10})
+                              .count();
+  ASSERT_TRUE(waitForTaskFailure(task.get(), taskWaitUs));
+  ASSERT_THAT(
+      task->errorMessage(),
+      testing::HasSubstr(
+          "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput"));
+}
+
+} // namespace facebook::velox::exec::test

From 211901c141f1b6b828e116eada52589eb40f3d09 Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Fri, 10 Apr 2026 02:24:49 -0700
Subject: [PATCH 11/24] feat(PartitionedOutput): Add normal vs optimized
 comparison in ExchangeBenchmark

- Added normal vs optimized PartitionedOutput comparison by running each
  exchange case twice with kOptimizedPartitionedOutputEnabled=false/true.
- Added per-mode benchmark names:
  - exchange<Case>_normalPartitionedOutput
  - exchange<Case>_optimizedPartitionedOutput in ExchangeBenchmark.cpp.
- Refactored result printing into shared helpers and fixed output
  consistency in ExchangeBenchmark.cpp.
---
 velox/exec/benchmarks/ExchangeBenchmark.cpp | 184 ++++++++------------
 1 file changed, 75 insertions(+), 109 deletions(-)

diff --git a/velox/exec/benchmarks/ExchangeBenchmark.cpp b/velox/exec/benchmarks/ExchangeBenchmark.cpp
index 45689ccbf64..16ab2885caf 100644
--- a/velox/exec/benchmarks/ExchangeBenchmark.cpp
+++ b/velox/exec/benchmarks/ExchangeBenchmark.cpp
@@ -67,6 +67,12 @@ struct LocalPartitionWaitStats {
   std::vector<int64_t> wallMs;
 };
 
+struct ExchangeRunStats {
+  int64_t wallUs = 0;
+  PlanNodeStats partitionedOutputStats;
+  PlanNodeStats exchangeStats;
+};
+
 void sortByMax(std::vector<RuntimeMetric>& metrics) {
   std::sort(
       metrics.begin(),
@@ -88,6 +94,18 @@ void sortByAndPrintMax(
             << "\n Min: " << metrics.back().toString() << std::endl;
 }
 
+void printExchangeStats(
+    const std::string& datasetName,
+    const std::string& modeName,
+    const ExchangeRunStats& stats) {
+  std::cout << "-----------------------------" << datasetName << " ("
+            << modeName << ")-----------------------------" << std::endl;
+  std::cout << "Wall Time (ms): " << succinctMicros(stats.wallUs) << std::endl;
+  std::cout << "PartitionOutput: " << stats.partitionedOutputStats.toString()
+            << std::endl;
+  std::cout << "Exchange: " << stats.exchangeStats.toString() << std::endl;
+}
+
 class ExchangeBenchmark : public VectorTestBase {
  public:
   std::vector<RowVectorPtr> makeRows(
@@ -120,6 +138,7 @@ class ExchangeBenchmark : public VectorTestBase {
       std::vector<RowVectorPtr>& vectors,
       int32_t width,
       int32_t taskWidth,
+      bool useOptimizedPartitionedOutput,
       int64_t& wallUs,
       PlanNodeStats& partitionedOutputStats,
       PlanNodeStats& exchangeStats) {
@@ -373,7 +392,7 @@ int32_t ExchangeBenchmark::iteration_;
 
 std::unique_ptr<ExchangeBenchmark> bm;
 
-void runBenchmarks() {
+void runBenchmarks(bool optimizedPartitionedOutputEnabled = false) {
   std::vector<std::string> flatNames = {"c0"};
   std::vector<TypePtr> flatTypes = {BIGINT()};
   std::vector<TypePtr> typeSelection = {
@@ -438,75 +457,51 @@ void runBenchmarks() {
   std::vector<RowVectorPtr> struct1k(
       bm->makeRows(structType, 100, 1000, FLAGS_dict_pct));
 
-  int64_t flat10KWallUs;
-  PlanNodeStats partitionedOutputStatsFlat10K;
-  PlanNodeStats exchangeStatsFlat10K;
-  folly::addBenchmark(__FILE__, "exchangeFlat10k", [&]() {
-    bm->run(
-        flat10k,
-        FLAGS_width,
-        FLAGS_task_width,
-        flat10KWallUs,
-        partitionedOutputStatsFlat10K,
-        exchangeStatsFlat10K);
-    return 1;
-  });
-
-  int64_t flat50KWallUs;
-  PlanNodeStats partitionedOutputStatsFlat50;
-  PlanNodeStats exchangeStatsFlat50;
-  folly::addBenchmark(__FILE__, "exchangeFlat50", [&]() {
-    bm->run(
-        flat50,
-        FLAGS_width,
-        FLAGS_task_width,
-        flat50KWallUs,
-        partitionedOutputStatsFlat50,
-        exchangeStatsFlat50);
-    return 1;
-  });
-
-  int64_t deep10KWallUs;
-  PlanNodeStats partitionedOutputStatsDeep10K;
-  PlanNodeStats exchangeStatsDeep10K;
-  folly::addBenchmark(__FILE__, "exchangeDeep10k", [&]() {
-    bm->run(
-        deep10k,
-        FLAGS_width,
-        FLAGS_task_width,
-        deep10KWallUs,
-        partitionedOutputStatsDeep10K,
-        exchangeStatsDeep10K);
-    return 1;
-  });
-
-  int64_t deep50KWallUs;
-  PlanNodeStats partitionedOutputStatsDeep50;
-  PlanNodeStats exchangeStatsDeep50;
-  folly::addBenchmark(__FILE__, "exchangeDeep50", [&]() {
-    bm->run(
-        deep50,
-        FLAGS_width,
-        FLAGS_task_width,
-        deep50KWallUs,
-        partitionedOutputStatsDeep50,
-        exchangeStatsDeep50);
-    return 1;
-  });
-
-  int64_t stuct1KWallUs;
-  PlanNodeStats partitionedOutputStatsStruct1K;
-  PlanNodeStats exchangeStatsStruct1K;
-  folly::addBenchmark(__FILE__, "exchangeStruct1K", [&]() {
-    bm->run(
-        struct1k,
-        FLAGS_width,
-        FLAGS_task_width,
-        stuct1KWallUs,
-        partitionedOutputStatsStruct1K,
-        exchangeStatsStruct1K);
-    return 1;
-  });
+  std::vector<std::pair<std::string, std::vector<RowVectorPtr>*>> exchangeCases{
+      {"Flat10K", &flat10k},
+      {"Flat50", &flat50},
+      {"Deep10K", &deep10k},
+      {"Deep50", &deep50},
+      {"Struct1K", &struct1k}};
+
+  std::vector<ExchangeRunStats> normalPartitionedOutputStats(
+      exchangeCases.size());
+  std::vector<ExchangeRunStats> optimizedPartitionedOutputStats(
+      exchangeCases.size());
+
+  for (size_t i = 0; i < exchangeCases.size(); ++i) {
+    const auto& name = exchangeCases[i].first;
+    folly::addBenchmark(
+        __FILE__,
+        fmt::format("exchange{}_normalPartitionedOutput", name),
+        [&, i]() {
+          bm->run(
+              *exchangeCases[i].second,
+              FLAGS_width,
+              FLAGS_task_width,
+              false,
+              normalPartitionedOutputStats[i].wallUs,
+              normalPartitionedOutputStats[i].partitionedOutputStats,
+              normalPartitionedOutputStats[i].exchangeStats);
+          return 1;
+        });
+    if (optimizedPartitionedOutputEnabled) {
+      folly::addBenchmark(
+          __FILE__,
+          fmt::format("exchange{}_optimizedPartitionedOutput", name),
+          [&, i]() {
+            bm->run(
+                *exchangeCases[i].second,
+                FLAGS_width,
+                FLAGS_task_width,
+                true,
+                optimizedPartitionedOutputStats[i].wallUs,
+                optimizedPartitionedOutputStats[i].partitionedOutputStats,
+                optimizedPartitionedOutputStats[i].exchangeStats);
+            return 1;
+          });
+    }
+  }
 
   int64_t localPartitionWallUs;
   PlanNodeStats localPartitionStatsFlat10K;
@@ -524,45 +519,16 @@ void runBenchmarks() {
 
   folly::runBenchmarks();
 
-  std::cout
-      << "----------------------------------Flat10K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(flat10KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsFlat10K.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsFlat10K.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Flat50K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(flat50KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsFlat50.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsFlat10K.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Deep10K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(deep10KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsDeep10K.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsDeep10K.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Deep50K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(deep50KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsDeep50.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsDeep50.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Struct1K---------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(stuct1KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsStruct1K.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsStruct1K.toString() << std::endl;
+  for (size_t i = 0; i < exchangeCases.size(); ++i) {
+    printExchangeStats(
+        exchangeCases[i].first, "normal", normalPartitionedOutputStats[i]);
+    if (optimizedPartitionedOutputEnabled) {
+      printExchangeStats(
+          exchangeCases[i].first,
+          "optimized",
+          optimizedPartitionedOutputStats[i]);
+    }
+  }
 
   std::cout
       << "--------------------------------LocalFlat10K-------------------------------"

From e1e10b33bfea7ab39ca33255184d845be2afde9d Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Sat, 11 Apr 2026 00:56:38 -0700
Subject: [PATCH 12/24] refactor(PartitionedOutput): Separate local partition
 exchange benchmark

Split the local partition exchange benchmark out of ExchangeBenchmark
into its own executable and CMake target, while keeping the local
benchmark logic and statistics reporting available in a dedicated binary.
---
 velox/exec/benchmarks/CMakeLists.txt          |  10 +
 velox/exec/benchmarks/ExchangeBenchmark.cpp   | 175 ----------
 .../benchmarks/LocalExchangeBenchmark.cpp     | 309 ++++++++++++++++++
 3 files changed, 319 insertions(+), 175 deletions(-)
 create mode 100644 velox/exec/benchmarks/LocalExchangeBenchmark.cpp

diff --git a/velox/exec/benchmarks/CMakeLists.txt b/velox/exec/benchmarks/CMakeLists.txt
index 7a721bf91a6..2f97200c904 100644
--- a/velox/exec/benchmarks/CMakeLists.txt
+++ b/velox/exec/benchmarks/CMakeLists.txt
@@ -40,6 +40,16 @@ target_link_libraries(
   Folly::follybenchmark
 )
 
+add_executable(velox_local_exchange_benchmark LocalExchangeBenchmark.cpp)
+
+target_link_libraries(
+  velox_local_exchange_benchmark
+  velox_exec
+  velox_exec_test_lib
+  velox_vector_test_lib
+  Folly::follybenchmark
+)
+
 add_executable(velox_merge_benchmark MergeBenchmark.cpp)
 
 target_link_libraries(
diff --git a/velox/exec/benchmarks/ExchangeBenchmark.cpp b/velox/exec/benchmarks/ExchangeBenchmark.cpp
index 16ab2885caf..3301b9e6e8d 100644
--- a/velox/exec/benchmarks/ExchangeBenchmark.cpp
+++ b/velox/exec/benchmarks/ExchangeBenchmark.cpp
@@ -32,13 +32,7 @@
 DEFINE_int32(width, 16, "Number of parties in shuffle");
 DEFINE_int32(task_width, 4, "Number of threads in each task in shuffle");
 
-DEFINE_int32(num_local_tasks, 8, "Number of concurrent local shuffles");
-DEFINE_int32(num_local_repeat, 8, "Number of repeats of local exchange query");
 DEFINE_int32(flat_batch_mb, 1, "MB in a 10k row flat batch.");
-DEFINE_int64(
-    local_exchange_buffer_mb,
-    32,
-    "task-wide buffer in local exchange");
 DEFINE_int64(exchange_buffer_mb, 32, "task-wide buffer in remote exchange");
 DEFINE_int32(dict_pct, 0, "Percentage of columns wrapped in dictionary");
 // Add the following definitions to allow Clion runs
@@ -59,41 +53,12 @@ using namespace facebook::velox::test;
 
 namespace {
 
-struct LocalPartitionWaitStats {
-  int64_t totalProducerWaitMs = 0;
-  int64_t totalConsumerWaitMs = 0;
-  std::vector<RuntimeMetric> consumerWaitMs;
-  std::vector<RuntimeMetric> producerWaitMs;
-  std::vector<int64_t> wallMs;
-};
-
 struct ExchangeRunStats {
   int64_t wallUs = 0;
   PlanNodeStats partitionedOutputStats;
   PlanNodeStats exchangeStats;
 };
 
-void sortByMax(std::vector<RuntimeMetric>& metrics) {
-  std::sort(
-      metrics.begin(),
-      metrics.end(),
-      [](const RuntimeMetric& left, const RuntimeMetric& right) {
-        return left.max > right.max;
-      });
-}
-
-void sortByAndPrintMax(
-    const char* title,
-    int64_t total,
-    std::vector<RuntimeMetric>& metrics) {
-  sortByMax(metrics);
-  VELOX_CHECK(!metrics.empty());
-  std::cout << title << "\n Total " << succinctNanos(total)
-            << "\n Max: " << metrics.front().toString()
-            << "\n Median: " << metrics[metrics.size() / 2].toString()
-            << "\n Min: " << metrics.back().toString() << std::endl;
-}
-
 void printExchangeStats(
     const std::string& datasetName,
     const std::string& modeName,
@@ -243,106 +208,6 @@ class ExchangeBenchmark : public VectorTestBase {
     };
   }
 
-  void runLocal(
-      std::vector<RowVectorPtr>& vectors,
-      int32_t taskWidth,
-      int32_t numTasks,
-      int64_t& localPartitionWallUs,
-      PlanNodeStats& partitionedOutputStats,
-      LocalPartitionWaitStats& localPartitionWaitStats) {
-    assert(!vectors.empty());
-
-    core::PlanNodePtr plan;
-    core::PlanNodeId localPartitionId1;
-    core::PlanNodeId localPartitionId2;
-    std::vector<std::shared_ptr<Task>> tasks;
-    std::vector<std::thread> threads;
-
-    RowVectorPtr expected;
-
-    BENCHMARK_SUSPEND {
-      std::vector<std::string> aggregates = {"count(1)"};
-      auto& rowType = vectors[0]->type()->as<TypeKind::ROW>();
-      for (auto i = 1; i < rowType.size(); ++i) {
-        aggregates.push_back(fmt::format("checksum({})", rowType.nameOf(i)));
-      }
-
-      // plan: Agg/kSingle(4) <-- LocalPartition/Gather(3) <-- Agg/kGather(2)
-      // <-- LocalPartition/kRepartition(1) <-- Values(0)
-      plan = exec::test::PlanBuilder()
-                 .values(vectors, true)
-                 .localPartition({"c0"})
-                 .capturePlanNodeId(localPartitionId1)
-                 .singleAggregation({}, aggregates)
-                 .localPartition(std::vector<std::string>{})
-                 .capturePlanNodeId(localPartitionId2)
-                 .singleAggregation({}, {"sum(a0)"})
-                 .planNode();
-
-      threads.reserve(numTasks);
-      expected = makeRowVector({makeFlatVector<int64_t>(1, [&](auto /*row*/) {
-        return vectors.size() * vectors[0]->size() * taskWidth;
-      })});
-    };
-
-    auto startMicros = getCurrentTimeMicro();
-    std::mutex mutex;
-    for (int32_t i = 0; i < numTasks; ++i) {
-      threads.push_back(std::thread([&]() {
-        for (auto repeat = 0; repeat < FLAGS_num_local_repeat; ++repeat) {
-          auto task =
-              exec::test::AssertQueryBuilder(plan)
-                  .config(
-                      core::QueryConfig::kMaxLocalExchangeBufferSize,
-                      fmt::format("{}", FLAGS_local_exchange_buffer_mb << 20))
-                  .maxDrivers(taskWidth)
-                  .assertResults(expected);
-          {
-            std::lock_guard<std::mutex> l(mutex);
-            tasks.push_back(task);
-          }
-        }
-      }));
-    }
-    for (auto& thread : threads) {
-      thread.join();
-    }
-
-    BENCHMARK_SUSPEND {
-      localPartitionWallUs = getCurrentTimeMicro() - startMicros;
-
-      std::vector<core::PlanNodeId> localPartitionNodeIds{
-          localPartitionId1, localPartitionId2};
-
-      localPartitionWaitStats.totalProducerWaitMs = 0;
-      localPartitionWaitStats.totalConsumerWaitMs = 0;
-      for (const auto& task : tasks) {
-        auto taskStats = task->taskStats();
-        localPartitionWaitStats.wallMs.push_back(
-            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
-        auto planStats = toPlanStats(taskStats);
-
-        for (const auto& nodeId : localPartitionNodeIds) {
-          auto& taskLocalPartition1Stats = planStats.at(nodeId);
-          partitionedOutputStats += taskLocalPartition1Stats;
-
-          auto& taskLocalPartition1RuntimeStats =
-              taskLocalPartition1Stats.customStats;
-          localPartitionWaitStats.producerWaitMs.push_back(
-              taskLocalPartition1RuntimeStats
-                  ["blockedWaitForProducerWallNanos"]);
-          localPartitionWaitStats.consumerWaitMs.push_back(
-              taskLocalPartition1RuntimeStats
-                  ["blockedWaitForConsumerWallNanos"]);
-          localPartitionWaitStats.totalProducerWaitMs +=
-              localPartitionWaitStats.producerWaitMs.back().sum;
-          localPartitionWaitStats.totalConsumerWaitMs +=
-              localPartitionWaitStats.consumerWaitMs.back().sum;
-        }
-      }
-    };
-  }
-
  private:
   static constexpr int64_t kMaxMemory = 6UL << 30; // 6GB
 
@@ -503,20 +368,6 @@ void runBenchmarks(bool optimizedPartitionedOutputEnabled = false) {
     }
   }
 
-  int64_t localPartitionWallUs;
-  PlanNodeStats localPartitionStatsFlat10K;
-  LocalPartitionWaitStats localPartitionWaitStats;
-  folly::addBenchmark(__FILE__, "localFlat10k", [&]() {
-    bm->runLocal(
-        flat10k,
-        FLAGS_width,
-        FLAGS_num_local_tasks,
-        localPartitionWallUs,
-        localPartitionStatsFlat10K,
-        localPartitionWaitStats);
-    return 1;
-  });
-
   folly::runBenchmarks();
 
   for (size_t i = 0; i < exchangeCases.size(); ++i) {
@@ -529,32 +380,6 @@ void runBenchmarks(bool optimizedPartitionedOutputEnabled = false) {
           optimizedPartitionedOutputStats[i]);
     }
   }
-
-  std::cout
-      << "--------------------------------LocalFlat10K-------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << "\n Total: "
-            << succinctMicros(localPartitionWallUs)
-            << "\n Max: " << localPartitionWaitStats.wallMs.back()
-            << "\n Median: "
-            << localPartitionWaitStats
-                   .wallMs[localPartitionWaitStats.wallMs.size() / 2]
-            << "\n Min: " << localPartitionWaitStats.wallMs.front()
-            << std::endl;
-  std::cout << "LocalPartition: " << localPartitionStatsFlat10K.toString()
-            << std::endl;
-  sortByAndPrintMax(
-      "Producer Wait Time (ms)",
-      localPartitionWaitStats.totalProducerWaitMs,
-      localPartitionWaitStats.producerWaitMs);
-  sortByAndPrintMax(
-      "Consumer Wait Time (ms)",
-      localPartitionWaitStats.totalConsumerWaitMs,
-      localPartitionWaitStats.consumerWaitMs);
-  std::sort(
-      localPartitionWaitStats.wallMs.begin(),
-      localPartitionWaitStats.wallMs.end());
-  assert(!localPartitionWaitStats.wallMs.empty());
 }
 
 } // namespace
diff --git a/velox/exec/benchmarks/LocalExchangeBenchmark.cpp b/velox/exec/benchmarks/LocalExchangeBenchmark.cpp
new file mode 100644
index 00000000000..50b7637fd92
--- /dev/null
+++ b/velox/exec/benchmarks/LocalExchangeBenchmark.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include <algorithm>
+#include <mutex>
+#include <thread>
+
+#include "velox/core/QueryConfig.h"
+#include "velox/dwio/common/tests/utils/BatchMaker.h"
+#include "velox/exec/PlanNodeStats.h"
+#include "velox/exec/tests/utils/AssertQueryBuilder.h"
+#include "velox/exec/tests/utils/PlanBuilder.h"
+#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h"
+#include "velox/functions/prestosql/registration/RegistrationFunctions.h"
+#include "velox/parse/TypeResolver.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+DEFINE_int32(width, 16, "Number of drivers in each local exchange task");
+DEFINE_int32(num_local_tasks, 8, "Number of concurrent local shuffles");
+DEFINE_int32(num_local_repeat, 8, "Number of repeats of local exchange query");
+DEFINE_int32(flat_batch_mb, 1, "MB in a 10k row flat batch.");
+DEFINE_int64(
+    local_exchange_buffer_mb,
+    32,
+    "task-wide buffer in local exchange");
+DEFINE_int32(dict_pct, 0, "Percentage of columns wrapped in dictionary");
+// Add the following definitions to allow Clion runs
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
+
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+struct LocalPartitionWaitStats {
+  int64_t totalProducerWaitMs = 0; // Accumulated 'sum' of producer waits.
+  int64_t totalConsumerWaitMs = 0; // Accumulated 'sum' of consumer waits.
+  std::vector<RuntimeMetric> consumerWaitMs; // Per task and partition node.
+  std::vector<RuntimeMetric> producerWaitMs; // Per task and partition node.
+  std::vector<int64_t> wallMs; // Per-task execution end minus start time.
+}; // NOTE(review): '*WaitMs' are fed from '...WallNanos' stats; confirm unit.
+
+void sortByMax(std::vector<RuntimeMetric>& metrics) {
+  // Descending by 'max': the entry with the largest maximum comes first.
+  const auto byMaxDescending = [](const RuntimeMetric& lhs,
+                                  const RuntimeMetric& rhs) {
+    return rhs.max < lhs.max;
+  };
+  std::sort(metrics.begin(), metrics.end(), byMaxDescending);
+}
+
+void sortByAndPrintMax(
+    const char* title,
+    int64_t total,
+    std::vector<RuntimeMetric>& metrics) {
+  sortByMax(metrics);
+  VELOX_CHECK(!metrics.empty());
+  const auto& median = metrics[metrics.size() / 2];
+  std::cout << title << "\n Total " << succinctNanos(total) << "\n Max: "
+            << metrics.front().toString() << "\n Median: " << median.toString()
+            << "\n Min: " << metrics.back().toString() << std::endl;
+}
+
+class LocalExchangeBenchmark : public VectorTestBase {
+ public:
+  /// Makes row batches, dictionary-wrapping the first 'dictPct'% of columns.
+  std::vector<RowVectorPtr> makeRows(
+      RowTypePtr type,
+      int32_t numVectors,
+      int32_t rowsPerVector,
+      int32_t dictPct = 0) {
+    std::vector<RowVectorPtr> vectors;
+    BufferPtr indices;
+    for (int32_t i = 0; i < numVectors; ++i) {
+      auto vector = std::dynamic_pointer_cast<RowVector>(
+          BatchMaker::createBatch(type, rowsPerVector, *pool_));
+      auto width = vector->childrenSize();
+      for (auto child = 0; child < width; ++child) {
+        if (static_cast<int32_t>(100 * child / width) < dictPct) {
+          if (!indices) {
+            indices = makeIndices(vector->size(), [](auto row) { return row; });
+          }
+          vector->childAt(child) = BaseVector::wrapInDictionary(
+              nullptr, indices, vector->size(), vector->childAt(child));
+        }
+      }
+      vectors.push_back(vector);
+    }
+    return vectors;
+  }
+
+  void runLocal(
+      std::vector<RowVectorPtr>& vectors,
+      int32_t taskWidth,
+      int32_t numTasks,
+      int64_t& localPartitionWallUs,
+      PlanNodeStats& partitionedOutputStats,
+      LocalPartitionWaitStats& localPartitionWaitStats) {
+    VELOX_CHECK(!vectors.empty());
+
+    core::PlanNodePtr plan;
+    core::PlanNodeId localPartitionId1;
+    core::PlanNodeId localPartitionId2;
+    std::vector<std::shared_ptr<Task>> tasks;
+    std::vector<std::thread> threads;
+
+    RowVectorPtr expected;
+
+    BENCHMARK_SUSPEND {
+      std::vector<std::string> aggregates = {"count(1)"};
+      auto& rowType = vectors[0]->type()->as<TypeKind::ROW>();
+      for (auto i = 1; i < rowType.size(); ++i) {
+        aggregates.push_back(fmt::format("checksum({})", rowType.nameOf(i)));
+      }
+
+      // plan: Agg/kSingle(4) <-- LocalPartition/Gather(3) <-- Agg/kGather(2)
+      // <-- LocalPartition/kRepartition(1) <-- Values(0)
+      plan = exec::test::PlanBuilder()
+                 .values(vectors, true)
+                 .localPartition({"c0"})
+                 .capturePlanNodeId(localPartitionId1)
+                 .singleAggregation({}, aggregates)
+                 .localPartition(std::vector<std::string>{})
+                 .capturePlanNodeId(localPartitionId2)
+                 .singleAggregation({}, {"sum(a0)"})
+                 .planNode();
+
+      threads.reserve(numTasks);
+      expected = makeRowVector({makeFlatVector<int64_t>(1, [&](auto /*row*/) {
+        return vectors.size() * vectors[0]->size() * taskWidth;
+      })});
+    };
+
+    const auto startMicros = getCurrentTimeMicro();
+    std::mutex mutex;
+    for (int32_t i = 0; i < numTasks; ++i) {
+      threads.push_back(std::thread([&]() {
+        for (auto repeat = 0; repeat < FLAGS_num_local_repeat; ++repeat) {
+          auto task =
+              exec::test::AssertQueryBuilder(plan)
+                  .config(
+                      core::QueryConfig::kMaxLocalExchangeBufferSize,
+                      fmt::format("{}", FLAGS_local_exchange_buffer_mb << 20))
+                  .maxDrivers(taskWidth)
+                  .assertResults(expected);
+          {
+            std::lock_guard<std::mutex> l(mutex);
+            tasks.push_back(task);
+          }
+        }
+      }));
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+
+    BENCHMARK_SUSPEND {
+      localPartitionWallUs = getCurrentTimeMicro() - startMicros;
+
+      std::vector<core::PlanNodeId> localPartitionNodeIds{
+          localPartitionId1, localPartitionId2};
+
+      localPartitionWaitStats.totalProducerWaitMs = 0;
+      localPartitionWaitStats.totalConsumerWaitMs = 0;
+      for (const auto& task : tasks) {
+        const auto taskStats = task->taskStats();
+        localPartitionWaitStats.wallMs.push_back(
+            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
+        const auto planStats = toPlanStats(taskStats);
+
+        for (const auto& nodeId : localPartitionNodeIds) {
+          const auto planStatsIt = planStats.find(nodeId);
+          if (planStatsIt == planStats.end()) {
+            continue;
+          }
+          const auto& taskLocalPartitionStats = planStatsIt->second;
+          partitionedOutputStats += taskLocalPartitionStats;
+
+          const auto& runtimeStats = taskLocalPartitionStats.customStats;
+          const auto producerWaitIt =
+              runtimeStats.find("blockedWaitForProducerWallNanos");
+          const auto consumerWaitIt =
+              runtimeStats.find("blockedWaitForConsumerWallNanos");
+          const RuntimeMetric producerWait =
+              producerWaitIt == runtimeStats.end() ? RuntimeMetric{}
+                                                   : producerWaitIt->second;
+          const RuntimeMetric consumerWait =
+              consumerWaitIt == runtimeStats.end() ? RuntimeMetric{}
+                                                   : consumerWaitIt->second;
+          localPartitionWaitStats.producerWaitMs.push_back(producerWait);
+          localPartitionWaitStats.consumerWaitMs.push_back(consumerWait);
+          localPartitionWaitStats.totalProducerWaitMs +=
+              localPartitionWaitStats.producerWaitMs.back().sum;
+          localPartitionWaitStats.totalConsumerWaitMs +=
+              localPartitionWaitStats.consumerWaitMs.back().sum;
+        }
+      }
+    };
+  }
+};
+
+std::unique_ptr<LocalExchangeBenchmark> bm;
+
+void runBenchmarks() {
+  std::vector<std::string> flatNames = {"c0"};
+  std::vector<TypePtr> flatTypes = {BIGINT()};
+  std::vector<TypePtr> typeSelection = {
+      BOOLEAN(),
+      TINYINT(),
+      DECIMAL(20, 3),
+      INTEGER(),
+      BIGINT(),
+      REAL(),
+      DECIMAL(10, 2),
+      DOUBLE(),
+      VARCHAR()};
+
+  int64_t flatSize = 0;
+  // Add enough columns of different types to make a 10K row batch be
+  // flat_batch_mb in flat size.
+  while (flatSize * 10000 < static_cast<int64_t>(FLAGS_flat_batch_mb) << 20) {
+    flatNames.push_back(fmt::format("c{}", flatNames.size()));
+    flatTypes.push_back(typeSelection[flatTypes.size() % typeSelection.size()]);
+    if (flatTypes.back()->isFixedWidth()) {
+      flatSize += flatTypes.back()->cppSizeInBytes();
+    } else {
+      flatSize += 20;
+    }
+  }
+  auto flatType = ROW(std::move(flatNames), std::move(flatTypes));
+  std::vector<RowVectorPtr> flat10k(
+      bm->makeRows(flatType, 10, 10000, FLAGS_dict_pct));
+
+  int64_t localPartitionWallUs;
+  PlanNodeStats localPartitionStatsFlat10K;
+  LocalPartitionWaitStats localPartitionWaitStats;
+  folly::addBenchmark(__FILE__, "localFlat10k", [&]() {
+    bm->runLocal(
+        flat10k,
+        FLAGS_width,
+        FLAGS_num_local_tasks,
+        localPartitionWallUs,
+        localPartitionStatsFlat10K,
+        localPartitionWaitStats);
+    return 1;
+  });
+
+  folly::runBenchmarks();
+
+  std::sort(
+      localPartitionWaitStats.wallMs.begin(),
+      localPartitionWaitStats.wallMs.end());
+  VELOX_CHECK(!localPartitionWaitStats.wallMs.empty());
+
+  std::cout
+      << "--------------------------------LocalFlat10K-------------------------------"
+      << std::endl;
+  std::cout << "Wall Time (ms): " << "\n Total: "
+            << succinctMicros(localPartitionWallUs)
+            << "\n Max: " << localPartitionWaitStats.wallMs.back()
+            << "\n Median: "
+            << localPartitionWaitStats
+                   .wallMs[localPartitionWaitStats.wallMs.size() / 2]
+            << "\n Min: " << localPartitionWaitStats.wallMs.front()
+            << std::endl;
+  std::cout << "LocalPartition: " << localPartitionStatsFlat10K.toString()
+            << std::endl;
+  sortByAndPrintMax(
+      "Producer Wait Time (ms)",
+      localPartitionWaitStats.totalProducerWaitMs,
+      localPartitionWaitStats.producerWaitMs);
+  sortByAndPrintMax(
+      "Consumer Wait Time (ms)",
+      localPartitionWaitStats.totalConsumerWaitMs,
+      localPartitionWaitStats.consumerWaitMs);
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv};
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  functions::prestosql::registerAllScalarFunctions();
+  aggregate::prestosql::registerAllAggregateFunctions();
+  parse::registerTypeResolver();
+
+  bm = std::make_unique<LocalExchangeBenchmark>();
+  runBenchmarks();
+  bm.reset();
+
+  return 0;
+}

From 627bf5de6323f5f553d9498bab05cd043ac5a64c Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Wed, 1 Apr 2026 16:15:29 +0100
Subject: [PATCH 13/24] feat(PartitionedOutput): Add constant support in
 PrestoIterativePartitioningSerializer

---
 .../PrestoIterativePartitioningSerializer.cpp | 161 +++++++++++++---
 .../PrestoIterativePartitioningSerializer.h   |  17 ++
 ...erativePartitioningSerializerBenchmark.cpp | 176 ++++++++++++++----
 ...stoIterativePartitioningSerializerTest.cpp | 151 +++++++++++++++
 velox/vector/PartitionedVector.cpp            |  21 ++-
 5 files changed, 462 insertions(+), 64 deletions(-)

diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.cpp b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
index 88e7e7f9a5d..f1df44212c7 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.cpp
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
@@ -18,6 +18,7 @@
 #include "velox/common/base/BitUtil.h"
 #include "velox/type/Type.h"
 #include "velox/vector/ComplexVector.h"
+#include "velox/vector/ConstantVector.h"
 #include "velox/vector/FlatVector.h"
 
 namespace facebook::velox::serializer::presto {
@@ -31,6 +32,9 @@ constexpr int64_t kUncompressedSizeOffset{kVectorSizeTypeSize + 1};
 // [numRows:4][codec:1][uncompressedSize:4][compressedSize:4][checksum:8]
 constexpr int64_t kHeaderSize{kUncompressedSizeOffset + 4 + 4 + 8};
 
+// Size in bytes of the scratch chunk used to write repeated constant values.
+constexpr int32_t kChunkBytes = 4096;
+
 static inline const std::string_view kByteArray{"BYTE_ARRAY"};
 static inline const std::string_view kShortArray{"SHORT_ARRAY"};
 static inline const std::string_view kIntArray{"INT_ARRAY"};
@@ -484,11 +488,13 @@ void PrestoIterativePartitioningSerializer::flushColumn(
     case TypeKind::ROW:
     case TypeKind::ARRAY:
     case TypeKind::MAP:
-      VELOX_NYI();
+      VELOX_NYI(
+          "Unsupported vector type kind for PrestoIterativePartitioningSerializer: {}",
+          typeKind);
 
     default:
       VELOX_UNSUPPORTED(
-          "Invalid vector encoding for PrestoIterativePartitioningSerializer: ",
+          "Invalid vector type kind for PrestoIterativePartitioningSerializer: {}",
           typeKind);
   }
 }
@@ -565,6 +571,59 @@ void PrestoIterativePartitioningSerializer::flushSingleFlatVector<
   }
 }
 
+template <TypeKind kind>
+void PrestoIterativePartitioningSerializer::flushSingleConstantVector(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  if constexpr (
+      kind == TypeKind::VARCHAR || kind == TypeKind::VARBINARY ||
+      kind == TypeKind::TIMESTAMP) {
+    VELOX_NYI(
+        "flushSingleConstantVector does not support variable-length type: {}",
+        kind);
+  }
+
+  using T = typename TypeTraits<kind>::NativeType;
+  auto* constantVector =
+      partitionedVector->baseVector()->template as<ConstantVector<T>>();
+  VELOX_DCHECK_NOT_NULL(constantVector);
+
+  if (constantVector->isNullAt(0)) {
+    return;
+  }
+
+  const auto value = constantVector->valueAtFast(0);
+  const auto* partitionOffsets = partitionedVector->rawPartitionOffsets();
+
+  Scratch scratch;
+  ScratchPtr<T> values(scratch);
+  const auto numRowsPerChunk =
+      std::max<vector_size_t>(1, kChunkBytes / sizeof(T));
+  const char* chunkBytes = nullptr;
+
+  vector_size_t lastOffset = 0;
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    const auto offset = partitionOffsets[p];
+    auto numRows = offset - lastOffset;
+    if (numRows > 0) {
+      VELOX_DCHECK_NOT_NULL(outputStreams[p]);
+
+      if (chunkBytes == nullptr) {
+        auto* ptr = values.get(numRowsPerChunk);
+        std::fill_n(ptr, numRowsPerChunk, value);
+        chunkBytes = reinterpret_cast<const char*>(ptr);
+      }
+
+      while (numRows > 0) {
+        auto n = std::min<vector_size_t>(numRowsPerChunk, numRows);
+        outputStreams[p]->write(chunkBytes, n * sizeof(T));
+        numRows -= n;
+      }
+    }
+    lastOffset = offset;
+  }
+}
+
 void PrestoIterativePartitioningSerializer::flushSingleSimpleVector(
     const PartitionedVectorPtr& partitionedVector,
     const std::vector<IOBufOutputStream*>& outputStreams) const {
@@ -576,16 +635,22 @@ void PrestoIterativePartitioningSerializer::flushSingleSimpleVector(
       VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
           flushSingleFlatVector, typeKind, partitionedVector, outputStreams);
       break;
-    case VectorEncoding::Simple::BIASED:
     case VectorEncoding::Simple::CONSTANT:
+      VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+          flushSingleConstantVector,
+          typeKind,
+          partitionedVector,
+          outputStreams);
+      break;
+    case VectorEncoding::Simple::BIASED:
     case VectorEncoding::Simple::DICTIONARY:
     case VectorEncoding::Simple::SEQUENCE:
       VELOX_NYI(
-          "Unsupported vector encoding for PrestoIterativePartitioningSerializer: ",
+          "Unsupported vector encoding for PrestoIterativePartitioningSerializer: {}",
           encoding);
     default:
       VELOX_UNSUPPORTED(
-          "Invalid vector encoding for PrestoIterativePartitioningSerializer:flushSingleSimpleVector ",
+          "Invalid vector encoding for PrestoIterativePartitioningSerializer:flushSingleSimpleVector: {}",
           encoding);
   }
 }
@@ -647,24 +712,25 @@ void PrestoIterativePartitioningSerializer::flushNulls(
 
   std::vector<vector_size_t> destBitOffsets(numPartitions_, 0);
   for (const auto& pv : partitionedVectors) {
-    const uint64_t* rawNulls = pv->baseVector()->rawNulls();
-    const auto* partitionOffsets = pv->rawPartitionOffsets();
-
-    vector_size_t startBit = 0;
-    for (uint32_t p : nonEmptyPartitions) {
-      const vector_size_t numBits = partitionOffsets[p] - startBit;
-      if (rawNulls && numBits > 0 && !bitmaps[p].empty()) {
-        bits::copyBits(
-            rawNulls,
-            startBit,
-            reinterpret_cast<uint64_t*>(bitmaps[p].data()),
-            destBitOffsets[p],
-            numBits);
-      }
-      if (!bitmaps[p].empty()) {
-        destBitOffsets[p] += numBits;
-      }
-      startBit = partitionOffsets[p];
+    auto encoding = pv->baseVector()->encoding();
+    switch (encoding) {
+      case VectorEncoding::Simple::FLAT:
+        flushSimpleVectorNulls(pv, nonEmptyPartitions, bitmaps, destBitOffsets);
+        break;
+      case VectorEncoding::Simple::CONSTANT:
+        flushConstantVectorNulls(
+            pv, nonEmptyPartitions, bitmaps, destBitOffsets);
+        break;
+      case VectorEncoding::Simple::BIASED:
+      case VectorEncoding::Simple::DICTIONARY:
+      case VectorEncoding::Simple::SEQUENCE:
+        VELOX_NYI(
+            "Unsupported vector encoding for PrestoIterativePartitioningSerializer: {}",
+            encoding);
+      default:
+        VELOX_UNSUPPORTED(
+            "Invalid vector encoding for PrestoIterativePartitioningSerializer: {}",
+            encoding);
     }
   }
 
@@ -686,6 +752,55 @@ void PrestoIterativePartitioningSerializer::flushNulls(
   }
 }
 
+void PrestoIterativePartitioningSerializer::flushSimpleVectorNulls(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    std::vector<std::vector<uint8_t>>& bitmaps,
+    std::vector<vector_size_t>& destBitOffsets) {
+  const uint64_t* rawNulls = partitionedVector->baseVector()->rawNulls();
+  const auto* rawPartitionOffsets = partitionedVector->rawPartitionOffsets();
+  vector_size_t startBit = 0;
+  for (uint32_t p : nonEmptyPartitions) {
+    vector_size_t numBits = rawPartitionOffsets[p] - startBit;
+    if (rawNulls && numBits > 0 && !bitmaps[p].empty()) {
+      bits::copyBits(
+          rawNulls,
+          startBit,
+          reinterpret_cast<uint64_t*>(bitmaps[p].data()),
+          destBitOffsets[p],
+          numBits);
+    }
+    if (!bitmaps[p].empty()) {
+      destBitOffsets[p] += numBits;
+    }
+    startBit = rawPartitionOffsets[p];
+  }
+}
+
+void PrestoIterativePartitioningSerializer::flushConstantVectorNulls(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    std::vector<std::vector<uint8_t>>& bitmaps,
+    std::vector<vector_size_t>& destBitOffsets) {
+  const bool isNullConstant = partitionedVector->baseVector()->isNullAt(0);
+  const auto* rawPartitionOffsets = partitionedVector->rawPartitionOffsets();
+  vector_size_t startBit = 0;
+  for (uint32_t p : nonEmptyPartitions) {
+    vector_size_t numBits = rawPartitionOffsets[p] - startBit;
+    if (isNullConstant && numBits > 0 && !bitmaps[p].empty()) {
+      bits::fillBits(
+          reinterpret_cast<uint64_t*>(bitmaps[p].data()),
+          destBitOffsets[p],
+          destBitOffsets[p] + numBits,
+          bits::kNull);
+    }
+    if (!bitmaps[p].empty()) {
+      destBitOffsets[p] += numBits;
+    }
+    startBit = rawPartitionOffsets[p];
+  }
+}
+
 template <typename T>
 void PrestoIterativePartitioningSerializer::flushFlatValues(
     const T* partitionedValues,
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.h b/velox/serializers/PrestoIterativePartitioningSerializer.h
index b9e41286ea6..89c79935e74 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.h
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.h
@@ -116,6 +116,11 @@ class PrestoIterativePartitioningSerializer {
       const PartitionedVectorPtr& partitionedVector,
       const std::vector<IOBufOutputStream*>& outputStreams) const;
 
+  template <TypeKind kind>
+  void flushSingleConstantVector(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
   void flushHeader(
       std::string_view name,
       const std::vector<uint32_t>& nonEmptyPartitions,
@@ -130,6 +135,18 @@ class PrestoIterativePartitioningSerializer {
       const std::vector<uint32_t>& nonEmptyPartitions,
       const std::vector<IOBufOutputStream*>& outputStreams) const;
 
+  static void flushSimpleVectorNulls(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      std::vector<std::vector<uint8_t>>& bitmaps,
+      std::vector<vector_size_t>& destBitOffsets);
+
+  static void flushConstantVectorNulls(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      std::vector<std::vector<uint8_t>>& bitmaps,
+      std::vector<vector_size_t>& destBitOffsets);
+
   template <typename T>
   void flushFlatValues(
       const T* partitionedValues,
diff --git a/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
index 3244281a5dc..ec6330f42ed 100644
--- a/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
+++ b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
@@ -32,7 +32,7 @@ class PrestoIterativePartitioningSerializerBenchmark
   /// Creates a flat vector of type T with deterministic null pattern.
   /// Rows where (row % 100) < nullPct are null.
   template <typename T>
-  VectorPtr makeColumnOfType(vector_size_t size, int32_t nullPct) {
+  VectorPtr makeFlatColumnOfType(vector_size_t size, int32_t nullPct) {
     if (nullPct == 0) {
       return makeFlatVector<T>(
           size, [](auto row) { return static_cast<T>(row); });
@@ -44,16 +44,37 @@ class PrestoIterativePartitioningSerializerBenchmark
   }
 
   /// Creates a flat vector of the given TypeKind with the given null ratio.
-  VectorPtr makeColumn(vector_size_t size, TypeKind colKind, int32_t nullPct) {
+  VectorPtr
+  makeFlatColumn(vector_size_t size, TypeKind colKind, int32_t nullPct) {
     switch (colKind) {
       case TypeKind::BOOLEAN:
-        return makeColumnOfType<bool>(size, nullPct);
+        return makeFlatColumnOfType<bool>(size, nullPct);
       case TypeKind::INTEGER:
-        return makeColumnOfType<int32_t>(size, nullPct);
+        return makeFlatColumnOfType<int32_t>(size, nullPct);
       case TypeKind::BIGINT:
-        return makeColumnOfType<int64_t>(size, nullPct);
+        return makeFlatColumnOfType<int64_t>(size, nullPct);
       case TypeKind::HUGEINT:
-        return makeColumnOfType<int128_t>(size, nullPct);
+        return makeFlatColumnOfType<int128_t>(size, nullPct);
+      default:
+        VELOX_UNSUPPORTED(
+            "Unsupported TypeKind: {}", TypeKindName::toName(colKind));
+    }
+  }
+
+  VectorPtr
+  makeConstantColumn(vector_size_t size, TypeKind colKind, bool nullConstant) {
+    if (nullConstant) {
+      return makeNullConstant(colKind, size);
+    }
+    switch (colKind) {
+      case TypeKind::BOOLEAN:
+        return makeConstant<bool>(true, size);
+      case TypeKind::INTEGER:
+        return makeConstant<int32_t>(42, size);
+      case TypeKind::BIGINT:
+        return makeConstant<int64_t>(1000, size);
+      case TypeKind::HUGEINT:
+        return makeConstant<int128_t>(10000, size);
       default:
         VELOX_UNSUPPORTED(
             "Unsupported TypeKind: {}", TypeKindName::toName(colKind));
@@ -63,16 +84,33 @@ class PrestoIterativePartitioningSerializerBenchmark
   /// Creates a RowVector with numCols columns of the given TypeKind.
   RowVectorPtr makeInput(
       vector_size_t size,
+      VectorEncoding::Simple encoding,
       TypeKind colKind,
       uint32_t numCols,
-      int32_t nullPct) {
+      int32_t nullPct,
+      bool nullConstant = false) {
     std::vector<std::string> names;
     std::vector<VectorPtr> children;
     names.reserve(numCols);
     children.reserve(numCols);
     for (uint32_t i = 0; i < numCols; ++i) {
       names.push_back(fmt::format("c{}", i));
-      children.push_back(makeColumn(size, colKind, nullPct));
+    }
+    switch (encoding) {
+      case VectorEncoding::Simple::FLAT: {
+        for (uint32_t i = 0; i < numCols; ++i) {
+          children.push_back(makeFlatColumn(size, colKind, nullPct));
+        }
+        break;
+      }
+      case VectorEncoding::Simple::CONSTANT: {
+        for (uint32_t i = 0; i < numCols; ++i) {
+          children.push_back(makeConstantColumn(size, colKind, nullConstant));
+        }
+        break;
+      }
+      default:
+        VELOX_UNSUPPORTED("Unsupported encoding: {}", encoding);
     }
     return makeRowVector(names, children);
   }
@@ -98,20 +136,23 @@ class PrestoIterativePartitioningSerializerBenchmark
 
 } // namespace
 
-/// Single benchmark function parameterized by (colKind, numCols, nullPct,
-/// numPartitions). Registered via BENCHMARK_NAMED_PARAM below.
+/// Single benchmark function parameterized by (encoding, colKind, numCols,
+/// nullPct, nullConstant, numPartitions). Registered via BENCHMARK_NAMED_PARAM
+/// below.
 ///
 /// All runs use 10'000 rows. Setup (input creation, serializer construction,
 /// append) is excluded from the measured time.
 void benchmarkFlush(
-    uint32_t /* iters */,
+    VectorEncoding::Simple encoding,
     TypeKind colKind,
     uint32_t numCols,
     int32_t nullPct,
+    bool nullConstant,
     uint32_t numPartitions) {
   folly::BenchmarkSuspender suspender;
   PrestoIterativePartitioningSerializerBenchmark benchmark;
-  auto input = benchmark.makeInput(10'000, colKind, numCols, nullPct);
+  auto input = benchmark.makeInput(
+      10'000, encoding, colKind, numCols, nullPct, nullConstant);
   auto parts = benchmark.makePartitions(10'000, numPartitions);
   auto serializer = benchmark.makeSerializer(
       std::static_pointer_cast<const RowType>(input->type()), numPartitions);
@@ -126,6 +167,36 @@ void benchmarkFlush(
   folly::doNotOptimizeAway(result);
 }
 
+void benchmarkFlushFlat(
+    uint32_t /* iters */,
+    TypeKind colKind,
+    uint32_t numCols,
+    int32_t nullPct,
+    uint32_t numPartitions) {
+  benchmarkFlush(
+      VectorEncoding::Simple::FLAT,
+      colKind,
+      numCols,
+      nullPct,
+      /*nullConstant=*/false,
+      numPartitions);
+}
+
+void benchmarkFlushConstant(
+    uint32_t /* iters */,
+    TypeKind colKind,
+    uint32_t numCols,
+    bool nullConstant,
+    uint32_t numPartitions) {
+  benchmarkFlush(
+      VectorEncoding::Simple::CONSTANT,
+      colKind,
+      numCols,
+      /*nullPct=*/0,
+      nullConstant,
+      numPartitions);
+}
+
 // clang-format off
 // Dimensions:
 //   col type:       {bool, int, bigint, hugeint}
@@ -133,34 +204,63 @@ void benchmarkFlush(
 //   null pct:       {0, 25, 50, 75, 100}
 //   num partitions: {1, 4, 16, 64, 256, 1024}
 //
-// Naming: flush_<type>_<N>cols_<P>pct_<K>parts
-
-#define FLUSH_PARAM(type_name, kind, num_cols, null_pct, num_parts) \
-  BENCHMARK_NAMED_PARAM(                                            \
-      benchmarkFlush,                                               \
-      type_name## _## num_cols## cols_## null_pct## pct_## num_parts## parts, \
-      TypeKind::kind, num_cols, null_pct, num_parts)
-
-#define FLUSH_FOR_PARTS(type_name, kind, num_cols, null_pct) \
-  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 1)        \
-  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 4)        \
-  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 16)       \
-  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 64)       \
-  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 256)      \
-  FLUSH_PARAM(type_name, kind, num_cols, null_pct, 1024)
-
-#define FLUSH_FOR_NULLS(type_name, kind, num_cols) \
-  FLUSH_FOR_PARTS(type_name, kind, num_cols, 0)    \
-  FLUSH_FOR_PARTS(type_name, kind, num_cols, 25)   \
-  FLUSH_FOR_PARTS(type_name, kind, num_cols, 50)   \
-  FLUSH_FOR_PARTS(type_name, kind, num_cols, 75)   \
-  FLUSH_FOR_PARTS(type_name, kind, num_cols, 100)
+// Naming: benchmarkFlushFlat(<type>_<N>cols_<P>pct_<K>parts)
+#define FLUSH_FLAT_PARAM(type_name, kind, num_cols, null_pct, num_parts)      \
+  BENCHMARK_NAMED_PARAM(                                                      \
+      benchmarkFlushFlat,                                                     \
+      type_name##_##num_cols##cols_##null_pct##pct_##num_parts##parts, \
+      TypeKind::kind,                                                         \
+      num_cols,                                                               \
+      null_pct,                                                               \
+      num_parts)
+
+// Dimensions:
+//   col type:       {bool, int, bigint, hugeint}
+//   num cols:       {1, 4, 16, 64}
+//   null constant:  {false, true}
+//   num partitions: {1, 4, 16, 64, 256, 1024}
+//
+// Naming: benchmarkFlushConstant(<type>_<N>cols_[not]null_<K>parts)
+#define FLUSH_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)           \
+  BENCHMARK_NAMED_PARAM(                                                     \
+      benchmarkFlushConstant,                                                \
+      type_name##_##num_cols##cols_##notnull_##num_parts##parts,             \
+      TypeKind::kind,                                                        \
+      num_cols,                                                              \
+      false,                                                                 \
+      num_parts)
+
+#define FLUSH_NULL_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)  \
+  BENCHMARK_NAMED_PARAM(                                                 \
+      benchmarkFlushConstant,                                            \
+      type_name##_##num_cols##cols_##null_##num_parts##parts,            \
+      TypeKind::kind,                                                    \
+      num_cols,                                                          \
+      true,                                                              \
+      num_parts)
+
+#define FLUSH_FOR_NULLS(type_name, kind, num_cols, num_parts) \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 0, num_parts)   \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 25, num_parts)  \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 50, num_parts)  \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 75, num_parts)  \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 100, num_parts) \
+  FLUSH_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)  \
+  FLUSH_NULL_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)
+
+#define FLUSH_FOR_PARTS(type_name, kind, num_cols) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 1)    \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 4)    \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 16)   \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 64)   \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 256)  \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 1024)
 
 #define FLUSH_FOR_COLS(type_name, kind) \
-  FLUSH_FOR_NULLS(type_name, kind, 1)   \
-  FLUSH_FOR_NULLS(type_name, kind, 4)   \
-  FLUSH_FOR_NULLS(type_name, kind, 16)  \
-  FLUSH_FOR_NULLS(type_name, kind, 64)
+  FLUSH_FOR_PARTS(type_name, kind, 1)   \
+  FLUSH_FOR_PARTS(type_name, kind, 4)   \
+  FLUSH_FOR_PARTS(type_name, kind, 16)  \
+  FLUSH_FOR_PARTS(type_name, kind, 64)
 
 FLUSH_FOR_COLS(bool, BOOLEAN)
 FLUSH_FOR_COLS(int, INTEGER)
diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
index e315684d811..79f20f1d886 100644
--- a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
+++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
@@ -87,6 +87,19 @@ class PrestoIterativePartitioningSerializerTestBase : public VectorTestBase {
   PrestoVectorSerde serde_;
 };
 
+template <>
+std::vector<bool> PrestoIterativePartitioningSerializerTestBase::sortedValues<
+    bool>(const RowVectorPtr& row, int column) {
+  auto* flat = row->childAt(column)->as<FlatVector<bool>>();
+  std::vector<bool> vals;
+  vals.reserve(row->size());
+  for (int i = 0; i < row->size(); ++i) {
+    vals.push_back(flat->valueAtFast(i));
+  }
+  std::sort(vals.begin(), vals.end());
+  return vals;
+}
+
 // ---------------------------------------------------------------------------
 // Value-parameterized fixture — routing, null-handling over scalar TypePtrs.
 // Uses BaseVector::create() + setNull() so no C++ type dispatch is needed.
@@ -489,6 +502,144 @@ TEST_F(PrestoIterativePartitioningSerializerTest, multipleCycles) {
   }
 }
 
+// ── Encoding
+// ─────────────────────────────────────────────────────────────────
+
+// Constant vectors are flattened across append() calls.
+TEST_F(PrestoIterativePartitioningSerializerTest, constantColumnAcrossAppends) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 3);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(11, 4)}), {0, 1, 0, 2});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(22, 5)}), {2, 0, 1, 1, 2});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 3);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+  auto r2 = deserialize(*ioBufs.at(2).first, type);
+
+  EXPECT_EQ(sortedValues<int64_t>(r0, 0), (std::vector<int64_t>{11, 11, 22}));
+  EXPECT_EQ(sortedValues<int64_t>(r1, 0), (std::vector<int64_t>{11, 22, 22}));
+  EXPECT_EQ(sortedValues<int64_t>(r2, 0), (std::vector<int64_t>{11, 22, 22}));
+}
+
+// Boolean constant vectors are flattened across append() calls.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    booleanConstantColumnAcrossAppends) {
+  auto type = ROW({"v"}, {BOOLEAN()});
+  auto serializer = makeSerializer(type, 2);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<bool>(true, 4)}), {0, 1, 0, 1});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<bool>(false, 3)}), {1, 0, 1});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+
+  EXPECT_EQ(sortedValues<bool>(r0, 0), (std::vector<bool>{false, true, true}));
+  EXPECT_EQ(
+      sortedValues<bool>(r1, 0), (std::vector<bool>{false, false, true, true}));
+}
+
+// Null constant vectors contribute only nulls but still advance row positions.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    nullConstantColumnAcrossAppends) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 2);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(std::nullopt, 3)}),
+      {0, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(7, 3)}), {1, 0, 1});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+
+  auto actual0 = nullableValues<int64_t>(r0, 0);
+  std::sort(actual0.begin(), actual0.end());
+  auto expected0 =
+      std::vector<std::optional<int64_t>>{std::nullopt, std::nullopt, 7};
+  EXPECT_EQ(actual0, expected0);
+
+  auto actual1 = nullableValues<int64_t>(r1, 0);
+  std::sort(actual1.begin(), actual1.end());
+  auto expected1 = std::vector<std::optional<int64_t>>{std::nullopt, 7, 7};
+  EXPECT_EQ(actual1, expected1);
+}
+
+// Constant and flat vectors are flattened and serialized correctly across
+// append() calls.
+TEST_F(PrestoIterativePartitioningSerializerTest, mixedConstantFlatVector) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 2);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(7, 3)}), {0, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3})}), {1, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(8, 2)}), {0, 1});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+
+  EXPECT_EQ(sortedValues<int64_t>(r0, 0), (std::vector<int64_t>{3, 7, 7, 8}));
+  EXPECT_EQ(sortedValues<int64_t>(r1, 0), (std::vector<int64_t>{1, 2, 7, 8}));
+}
+
+// Null constant rows are preserved and serialized correctly with flat and
+// nullable flat vectors across append() calls.
+TEST_F(PrestoIterativePartitioningSerializerTest, mixedNullConstantFlatVector) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 2);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4})}),
+      {0, 1, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(std::nullopt, 3)}),
+      {0, 1, 0});
+  serializer->append(
+      makeRowVector(
+          {"v"}, {makeNullableFlatVector<int64_t>({std::nullopt, 7, 3})}),
+      {1, 0, 1});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, type);
+  auto r1 = deserialize(*ioBufs.at(1).first, type);
+
+  auto actual0 = nullableValues<int64_t>(r0, 0);
+  std::sort(actual0.begin(), actual0.end());
+  auto expected0 =
+      std::vector<std::optional<int64_t>>{std::nullopt, std::nullopt, 1, 4, 7};
+  EXPECT_EQ(actual0, expected0);
+
+  auto actual1 = nullableValues<int64_t>(r1, 0);
+  std::sort(actual1.begin(), actual1.end());
+  auto expected1 =
+      std::vector<std::optional<int64_t>>{std::nullopt, std::nullopt, 2, 3, 3};
+  EXPECT_EQ(actual1, expected1);
+}
+
 // ── Scale and regression
 // ───────────────────────────────────────────────────────
 
diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
index bc83840aa9c..233c932fee9 100644
--- a/velox/vector/PartitionedVector.cpp
+++ b/velox/vector/PartitionedVector.cpp
@@ -308,8 +308,11 @@ PartitionedVectorPtr PartitionedVector::create(
     }
 
     case VectorEncoding::Simple::CONSTANT: {
-      return std::make_shared<PartitionedConstantVector>(
-          vector, numPartitions, endPartitionOffsets, pool);
+      auto partitionedConstantVector =
+          std::make_shared<PartitionedConstantVector>(
+              vector, numPartitions, endPartitionOffsets, pool);
+      partitionedConstantVector->partition(partitions, ctx);
+      return partitionedConstantVector;
     }
 
     case VectorEncoding::Simple::ARRAY:
@@ -461,7 +464,19 @@ VectorPtr PartitionedRowVector::partitionAt(uint32_t partition) const {
 
 void PartitionedConstantVector::partition(
     const std::vector<uint32_t>& /*partitions*/,
-    PartitionBuildContext& /*ctx*/) {}
+    PartitionBuildContext& /*ctx*/) {
+  if (!vector_->isNullAt(0)) {
+    return;
+  }
+
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1];
+    const vector_size_t end = rawEndPartitionOffsets_[p];
+    if (begin < end) {
+      numNullsPerPartition_[p] = end - begin;
+    }
+  }
+}
 
 VectorPtr PartitionedConstantVector::partitionAt(uint32_t partition) const {
   VELOX_CHECK_LT(partition, numPartitions_);

From 12f84ef20e85845f05ac4dce19c510236611bebe Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Tue, 21 Apr 2026 08:45:40 -0700
Subject: [PATCH 14/24] fix(PartitionedOutput): wire optimized partitioned
 output to OutputBufferManager listeners

Pass an OutputBufferManager-backed listener factory into
PrestoIterativePartitioningSerializer so the optimized path uses the same
listener source as normal PartitionedOutput. Create per-partition listeners
during flush, set the checksum bit only when a listener is present, and
compute the page checksum only for PrestoOutputStreamListener instances.
Also add tests that verify checksum headers are written and that the
serialized pages round-trip through the standard deserializer.
---
 velox/exec/OptimizedPartitionedOutput.cpp     | 12 +++-
 .../PrestoIterativePartitioningSerializer.cpp | 58 +++++++++++-----
 .../PrestoIterativePartitioningSerializer.h   | 15 +++-
 ...stoIterativePartitioningSerializerTest.cpp | 69 +++++++++++++++++++
 4 files changed, 132 insertions(+), 22 deletions(-)

diff --git a/velox/exec/OptimizedPartitionedOutput.cpp b/velox/exec/OptimizedPartitionedOutput.cpp
index bad3ea49378..e825ea94633 100644
--- a/velox/exec/OptimizedPartitionedOutput.cpp
+++ b/velox/exec/OptimizedPartitionedOutput.cpp
@@ -69,7 +69,17 @@ OptimizedPartitionedOutput::OptimizedPartitionedOutput(
 
   serializer_ = std::make_unique<
       serializer::presto::PrestoIterativePartitioningSerializer>(
-      inputType_, numDestinations_, options, pool_);
+      inputType_,
+      numDestinations_,
+      options,
+      pool_,
+      [bufferManager =
+           bufferManager_]() -> std::unique_ptr<OutputStreamListener> {
+        auto lockedBufferManager = bufferManager.lock();
+        VELOX_CHECK_NOT_NULL(
+            lockedBufferManager, "OutputBufferManager was already destructed");
+        return lockedBufferManager->newListener();
+      });
 }
 
 void OptimizedPartitionedOutput::addInput(RowVectorPtr input) {
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.cpp b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
index f1df44212c7..20f893107f1 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.cpp
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
@@ -51,9 +51,11 @@ inline void writeInt64(OutputStream* out, int64_t value) {
   out->write(reinterpret_cast<const char*>(&value), sizeof(value));
 }
 
-char getCodecMarker() {
+char getCodecMarker(bool checksumEnabled) {
   char marker = 0;
-  marker |= kCheckSumBitMask;
+  if (checksumEnabled) {
+    marker |= kCheckSumBitMask;
+  }
   return marker;
 }
 
@@ -233,11 +235,13 @@ PrestoIterativePartitioningSerializer::PrestoIterativePartitioningSerializer(
     RowTypePtr inputType,
     uint32_t numPartitions,
     const SerdeOpts& opts,
-    memory::MemoryPool* pool)
+    memory::MemoryPool* pool,
+    std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory)
     : type_(std::move(inputType)),
       numPartitions_(numPartitions),
       opts_(opts),
       pool_(pool),
+      listenerFactory_(std::move(listenerFactory)),
       rowsPerPartition_(numPartitions, 0) {
   VELOX_CHECK_GT(numPartitions_, 0);
   VELOX_CHECK_NOT_NULL(pool_);
@@ -306,8 +310,6 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
     return {};
   }
 
-  const char codecMask = getCodecMarker();
-
   // 1. Determine non-empty partitions.
   std::vector<uint32_t> nonEmptyPartitions;
   for (uint32_t p = 0; p < numPartitions_; ++p) {
@@ -335,10 +337,23 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
         numPartitions_);
   }
 
-  // 3. Create output streams sized to the exact bytes each partition will need,
+  // 3. Create per-partition listeners first so the codec mask can be derived
+  // from whether the factory actually produced a listener. The factory may
+  // return nullptr (e.g. when OutputBufferManager has no listener factory
+  // set), in which case checksumming is skipped and the checksum bit must not
+  // be set in the codec byte.
+  std::vector<std::unique_ptr<OutputStreamListener>> listeners(numPartitions_);
+  for (uint32_t p : nonEmptyPartitions) {
+    if (listenerFactory_) {
+      listeners[p] = listenerFactory_();
+    }
+  }
+  const bool checksumEnabled = !nonEmptyPartitions.empty() &&
+      listeners[nonEmptyPartitions[0]] != nullptr;
+  const char codecMask = getCodecMarker(checksumEnabled);
+
+  // 4. Create output streams sized to the exact bytes each partition will need,
   // so that the entire payload fits. This avoids multiple resizing and copying.
-  std::vector<std::unique_ptr<PrestoOutputStreamListener>> listeners(
-      numPartitions_);
   std::vector<std::unique_ptr<IOBufOutputStream>> outputStreams(numPartitions_);
   std::vector<IOBufOutputStream*> rawOutputStreams(numPartitions_);
   std::vector<std::streampos> beginStreamPositions(numPartitions_);
@@ -348,7 +363,6 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
     for (uint32_t col = 0; col < rowSchema.size(); ++col) {
       initialSize += flushSizes_[col][p];
     }
-    listeners[p] = std::make_unique<PrestoOutputStreamListener>();
     outputStreams[p] = std::make_unique<IOBufOutputStream>(
         *pool_, listeners[p].get(), initialSize);
     rawOutputStreams[p] = outputStreams[p].get();
@@ -357,11 +371,11 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
     flushStart(*outputStreams[p], p, codecMask);
   }
 
-  // 4. Flush column data.
+  // 5. Flush column data.
   flushRowChildren(
       partitionedRowVectors_, rowSchema, nonEmptyPartitions, rawOutputStreams);
 
-  // 5. Finalize the page by seeking back to fill in sizes and CRC, and get the
+  // 6. Finalize the page by seeking back to fill in sizes and CRC, and get the
   // IOBuf and numOfRows from each stream.
   std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
       result;
@@ -371,7 +385,7 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
         p,
         beginStreamPositions[p],
         codecMask,
-        *listeners[p]);
+        listeners[p].get());
     result[p] =
         std::make_pair(outputStreams[p]->getIOBuf(), rowsPerPartition_[p]);
   }
@@ -438,17 +452,23 @@ void PrestoIterativePartitioningSerializer::flushFinish(
     uint32_t partition,
     std::streampos beginOffset,
     char codecMask,
-    PrestoOutputStreamListener& listener) const {
-  listener.pause();
+    OutputStreamListener* listener) const {
+  auto* prestoListener = dynamic_cast<PrestoOutputStreamListener*>(listener);
+  if (prestoListener) {
+    prestoListener->pause();
+  }
 
   const std::streampos totalSize =
       static_cast<int32_t>(out.tellp() - beginOffset);
   const std::streampos uncompressedSize = totalSize - kHeaderSize;
-  const int64_t crc = computeChecksum(
-      listener,
-      static_cast<int8_t>(codecMask),
-      static_cast<int32_t>(rowsPerPartition_[partition]),
-      uncompressedSize);
+  int64_t crc = 0;
+  if (prestoListener) {
+    crc = computeChecksum(
+        *prestoListener,
+        static_cast<int8_t>(codecMask),
+        static_cast<int32_t>(rowsPerPartition_[partition]),
+        uncompressedSize);
+  }
 
   out.seekp(beginOffset + kUncompressedSizeOffset);
   writeInt32(&out, uncompressedSize);
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.h b/velox/serializers/PrestoIterativePartitioningSerializer.h
index 89c79935e74..f0ea802fe9e 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.h
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.h
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <functional>
 #include <map>
 #include <memory>
 #include <vector>
@@ -38,11 +39,20 @@ using SerdeOpts = PrestoVectorSerde::PrestoOptions;
 /// internal state so the serializer can be reused for the next cycle.
 class PrestoIterativePartitioningSerializer {
  public:
+  /// Constructs the serializer. If `listenerFactory` is set, each flush calls it
+  /// once per non-empty partition. When it yields a listener, the checksum bit
+  /// is set in the Presto page codec byte; the CRC32 itself is computed only
+  /// for PrestoOutputStreamListener instances (otherwise the CRC stays 0). If
+  /// the factory is null or returns nullptr, checksumming is skipped and the
+  /// bit stays clear, matching kNormal PartitionedOutput when
+  /// OutputBufferManager has no listener factory set.
   PrestoIterativePartitioningSerializer(
       RowTypePtr inputType,
       uint32_t numPartitions,
       const SerdeOpts& opts,
-      memory::MemoryPool* pool);
+      memory::MemoryPool* pool,
+      std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory =
+          nullptr);
 
   /// Routes each row in `input` to the partition indicated by `partitions`.
   /// `partitions.size()` must equal `input->size()`.
@@ -87,7 +97,7 @@ class PrestoIterativePartitioningSerializer {
       uint32_t partition,
       std::streampos beginOffset,
       char codecMask,
-      PrestoOutputStreamListener& listener) const;
+      OutputStreamListener* listener) const;
 
   void flushRowChildren(
       const std::vector<PartitionedVectorPtr>& partitionedVectors,
@@ -162,6 +172,7 @@ class PrestoIterativePartitioningSerializer {
   uint32_t numPartitions_;
   SerdeOpts opts_;
   memory::MemoryPool* pool_;
+  std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory_;
 
   /// Cumulative row count per partition across all appended batches.
   std::vector<vector_size_t> rowsPerPartition_;
diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
index 79f20f1d886..ea76dc11ab6 100644
--- a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
+++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
@@ -84,6 +84,42 @@ class PrestoIterativePartitioningSerializerTestBase : public VectorTestBase {
         type, numPartitions, opts, pool_.get());
   }
 
+  /// Builds a serializer that computes a CRC32 checksum on each flush via a
+  /// PrestoOutputStreamListener factory, matching the kOptimized path when
+  /// OutputBufferManager has a listener factory set.
+  std::unique_ptr<PrestoIterativePartitioningSerializer>
+  makeSerializerWithListener(const RowTypePtr& type, uint32_t numPartitions) {
+    SerdeOpts opts;
+    return std::make_unique<PrestoIterativePartitioningSerializer>(
+        type,
+        numPartitions,
+        opts,
+        pool_.get(),
+        []() -> std::unique_ptr<OutputStreamListener> {
+          return std::make_unique<PrestoOutputStreamListener>();
+        });
+  }
+
+  // Presto page header layout: [numRows:4][codec:1][uncompressedSize:4]
+  //                             [compressedSize:4][checksum:8]
+  static constexpr int kCodecByteOffset = 4;
+  static constexpr int kChecksumOffset = 13;
+  static constexpr int8_t kChecksumBitMask = 4;
+
+  /// Returns the codec byte from the Presto page header in `iobuf`.
+  static int8_t codecByte(const folly::IOBuf& iobuf) {
+    VELOX_CHECK_GE(iobuf.length(), kChecksumOffset + 8);
+    return reinterpret_cast<const int8_t*>(iobuf.data())[kCodecByteOffset];
+  }
+
+  /// Returns the 8-byte checksum field from the Presto page header in `iobuf`.
+  static int64_t checksumField(const folly::IOBuf& iobuf) {
+    VELOX_CHECK_GE(iobuf.length(), kChecksumOffset + 8);
+    int64_t value;
+    std::memcpy(&value, iobuf.data() + kChecksumOffset, sizeof(value));
+    return value;
+  }
+
   PrestoVectorSerde serde_;
 };
 
@@ -750,6 +786,39 @@ TEST_F(
   }
 }
 
+// ── Checksum (CRC32)
+// ──────────────────────────────────────────────────────
+
+// Verify the checksum bit is set and a non-zero checksum is written when a
+// PrestoOutputStreamListener factory is provided, and that the standard
+// deserializer (which validates the checksum) accepts the page.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, checksumRoundTrip) {
+  auto colType = GetParam();
+  auto type = ROW({"a"}, {colType});
+  auto col = BaseVector::create(colType, 6, pool_.get());
+  col->setNull(1, true);
+  col->setNull(4, true);
+
+  auto serializer = makeSerializerWithListener(type, 2);
+  serializer->append(makeRowVector({"a"}, {col}), {0, 1, 0, 1, 0, 1});
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  for (auto& [partition, pageData] : ioBufs) {
+    auto& iobuf = *pageData.first;
+    EXPECT_NE(codecByte(iobuf) & kChecksumBitMask, 0)
+        << "checksum bit must be set in codec byte";
+    EXPECT_NE(checksumField(iobuf), 0) << "checksum field must be non-zero";
+    // Deserializer validates the checksum internally; throws if wrong.
+    auto result = deserialize(iobuf, type);
+    EXPECT_GT(result->size(), 0);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Non-typed fixture (TEST_F) — lifecycle, structural, regression
+// ---------------------------------------------------------------------------
+
 // Regression: flushNulls previously wrote null bitmaps by obtaining a raw
 // pointer via writePosition() then advancing the stream via seekp(). This
 // assumed the pre-allocated IOBufOutputStream had a single contiguous range,

From ee8a1aa1bb2459e1eb07853495bc882c2d38fa8b Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Sun, 12 Apr 2026 07:40:10 -0700
Subject: [PATCH 15/24] refactor(PartitionedOutput): rework ExchangeBenchmark
 inputs and reporting

- add explicit simple-schema benchmark cases by type and column count
- register normal and optimized runs as separate named benchmark cases
- make `dictPct` apply per generated vector and recurse into nested types
- generate benchmark input vectors directly with optional nulls
- replace ad hoc flat input generation with explicit input specs
- return `ExchangeRunStats` from benchmark runs and centralize query config
- group printed results by dataset with normal vs. optimized stats
---
 velox/exec/benchmarks/ExchangeBenchmark.cpp | 686 +++++++++++++++-----
 1 file changed, 517 insertions(+), 169 deletions(-)

diff --git a/velox/exec/benchmarks/ExchangeBenchmark.cpp b/velox/exec/benchmarks/ExchangeBenchmark.cpp
index 3301b9e6e8d..d204f4ed666 100644
--- a/velox/exec/benchmarks/ExchangeBenchmark.cpp
+++ b/velox/exec/benchmarks/ExchangeBenchmark.cpp
@@ -17,7 +17,6 @@
 #include <folly/init/Init.h>
 
 #include "velox/core/QueryConfig.h"
-#include "velox/dwio/common/tests/utils/BatchMaker.h"
 #include "velox/exec/Exchange.h"
 #include "velox/exec/PlanNodeStats.h"
 #include "velox/exec/tests/utils/AssertQueryBuilder.h"
@@ -32,9 +31,13 @@
 DEFINE_int32(width, 16, "Number of parties in shuffle");
 DEFINE_int32(task_width, 4, "Number of threads in each task in shuffle");
 
-DEFINE_int32(flat_batch_mb, 1, "MB in a 10k row flat batch.");
 DEFINE_int64(exchange_buffer_mb, 32, "task-wide buffer in remote exchange");
-DEFINE_int32(dict_pct, 0, "Percentage of columns wrapped in dictionary");
+DEFINE_int32(
+    dict_pct,
+    0,
+    "Percentage of vectors per column wrapped in dictionary encoding. "
+    "Applied independently to each column across all generated row vectors "
+    "and recursively to nested children.");
 // Add the following definitions to allow Clion runs
 DEFINE_bool(gtest_color, false, "");
 DEFINE_string(gtest_filter, "*", "");
@@ -53,60 +56,401 @@ using namespace facebook::velox::test;
 
 namespace {
 
+bool shouldWrapVector(
+    int32_t vectorIndex,
+    int32_t numVectors,
+    int32_t dictPct) {
+  VELOX_CHECK_GE(dictPct, 0);
+  VELOX_CHECK_LE(dictPct, 100);
+  return dictPct > 0 && (vectorIndex * 100) / numVectors < dictPct;
+}
+
+void wrapDictionaryRecursive(VectorPtr& vector) {
+  if (!vector) {
+    return;
+  }
+
+  switch (vector->encoding()) {
+    case VectorEncoding::Simple::ROW: {
+      auto row = vector->as<RowVector>();
+      for (auto i = 0; i < row->childrenSize(); ++i) {
+        wrapDictionaryRecursive(row->childAt(i));
+      }
+      break;
+    }
+    case VectorEncoding::Simple::ARRAY: {
+      auto array = vector->as<ArrayVector>();
+      auto elements = array->elements();
+      wrapDictionaryRecursive(elements);
+      array->setElements(std::move(elements));
+      break;
+    }
+    case VectorEncoding::Simple::MAP: {
+      auto map = vector->as<MapVector>();
+      auto keys = map->mapKeys();
+      auto values = map->mapValues();
+      wrapDictionaryRecursive(keys);
+      wrapDictionaryRecursive(values);
+      map->setKeysAndValues(std::move(keys), std::move(values));
+      break;
+    }
+    default:
+      break;
+  }
+
+  auto indices = facebook::velox::test::makeIndices(
+      vector->size(), [](auto row) { return row; }, vector->pool());
+  vector =
+      BaseVector::wrapInDictionary(nullptr, indices, vector->size(), vector);
+}
+
 struct ExchangeRunStats {
   int64_t wallUs = 0;
   PlanNodeStats partitionedOutputStats;
   PlanNodeStats exchangeStats;
 };
 
-void printExchangeStats(
-    const std::string& datasetName,
-    const std::string& modeName,
-    const ExchangeRunStats& stats) {
-  std::cout << "-----------------------------" << datasetName << " ("
-            << modeName << ")-----------------------------" << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(stats.wallUs) << std::endl;
-  std::cout << "PartitionOutput: " << stats.partitionedOutputStats.toString()
-            << std::endl;
-  std::cout << "Exchange: " << stats.exchangeStats.toString() << std::endl;
+enum class ExchangeMode {
+  kNormal,
+  kOptimized,
+};
+
+/// Column element type dimension for simple-schema exchange benchmarks.
+enum class SimpleColType {
+  kBoolean,
+  kTinyint,
+  kInteger,
+  kBigint,
+  kHugeint,
+  kLongDecimal,
+  kDouble,
+};
+
+TypePtr simpleColTypeToType(SimpleColType colType) {
+  switch (colType) {
+    case SimpleColType::kBoolean:
+      return BOOLEAN();
+    case SimpleColType::kTinyint:
+      return TINYINT();
+    case SimpleColType::kInteger:
+      return INTEGER();
+    case SimpleColType::kBigint:
+      return BIGINT();
+    case SimpleColType::kHugeint:
+      return HUGEINT();
+    case SimpleColType::kLongDecimal:
+      return DECIMAL(20, 3);
+    case SimpleColType::kDouble:
+      return DOUBLE();
+  }
+  VELOX_UNREACHABLE();
+}
+
+std::string simpleColTypeName(SimpleColType colType) {
+  switch (colType) {
+    case SimpleColType::kBoolean:
+      return "Boolean";
+    case SimpleColType::kTinyint:
+      return "Tinyint";
+    case SimpleColType::kInteger:
+      return "Integer";
+    case SimpleColType::kBigint:
+      return "Bigint";
+    case SimpleColType::kHugeint:
+      return "Hugeint";
+    case SimpleColType::kLongDecimal:
+      return "LongDecimal";
+    case SimpleColType::kDouble:
+      return "Double";
+  }
+  VELOX_UNREACHABLE();
+}
+
+enum class ExchangeInputKind {
+  kDeep10K,
+  kDeep50,
+  kStruct1K,
+};
+
+struct ExchangeInputSpec {
+  std::string name;
+  RowTypePtr type;
+  int32_t numVectors;
+  int32_t rowsPerVector;
+};
+
+struct ExchangeBenchmarkResult {
+  std::string datasetName;
+  ExchangeMode mode;
+  ExchangeRunStats stats;
+};
+
+std::vector<ExchangeBenchmarkResult> benchmarkResults;
+
+std::string modeName(ExchangeMode mode) {
+  switch (mode) {
+    case ExchangeMode::kNormal:
+      return "normal";
+    case ExchangeMode::kOptimized:
+      return "optimized";
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+/// Creates a simple row type with `numCols` columns all of type `colType`.
+RowTypePtr makeSimpleType(const TypePtr& colType, int32_t numCols) {
+  std::vector<std::string> names;
+  std::vector<TypePtr> types;
+  names.reserve(numCols);
+  types.reserve(numCols);
+  for (int32_t i = 0; i < numCols; ++i) {
+    names.push_back(fmt::format("c{}", i));
+    types.push_back(colType);
+  }
+  return ROW(std::move(names), std::move(types));
+}
+
+RowTypePtr makeStructType() {
+  return ROW(
+      {{"c0", BIGINT()},
+       {"r1",
+        ROW(
+            {{"k2", BIGINT()},
+             {"r2",
+              ROW(
+                  {{"i1", BIGINT()},
+                   {"i2", BIGINT()},
+                   {"r3",
+                    ROW(
+                        {{"s3", VARCHAR()},
+                         {"i5", INTEGER()},
+                         {"d5", DOUBLE()},
+                         {"b5", BOOLEAN()},
+                         {"a5", ARRAY(TINYINT())}})}})}})}});
+}
+
+RowTypePtr makeDeepType() {
+  return ROW(
+      {{"c0", BIGINT()},
+       {"long_array_val", ARRAY(ARRAY(BIGINT()))},
+       {"array_val", ARRAY(VARCHAR())},
+       {"struct_val", ROW({{"s_int", INTEGER()}, {"s_array", ARRAY(REAL())}})},
+       {"map_val",
+        MAP(VARCHAR(),
+            MAP(BIGINT(),
+                ROW({{"s2_int", INTEGER()}, {"s2_string", VARCHAR()}})))}});
+}
+
+ExchangeInputSpec makeInputSpec(ExchangeInputKind kind) {
+  switch (kind) {
+    case ExchangeInputKind::kDeep10K:
+      return {"Deep10K", makeDeepType(), 10, 10000};
+    case ExchangeInputKind::kDeep50:
+      return {"Deep50", makeDeepType(), 2000, 50};
+    case ExchangeInputKind::kStruct1K:
+      return {"Struct1K", makeStructType(), 100, 1000};
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+ExchangeInputSpec makeInputSpec(SimpleColType colType, int32_t numCols) {
+  return {
+      fmt::format("Simple10K_{}_col{}", simpleColTypeName(colType), numCols),
+      makeSimpleType(simpleColTypeToType(colType), numCols),
+      10,
+      10'000};
+}
+
+std::string formatStat(const ExchangeRunStats* stats, auto formatter) {
+  if (stats == nullptr) {
+    return "N/A";
+  }
+  return formatter(*stats);
+}
+
+// Prints every collected result grouped by dataset (first-seen order) with
+// normal and optimized stats side by side; a mode that did not run shows N/A.
+void printAllExchangeStats() {
+  struct PairedStats {
+    const ExchangeRunStats* normal = nullptr;
+    const ExchangeRunStats* optimized = nullptr;
+  };
+
+  std::vector<std::string> datasetOrder;
+  std::unordered_map<std::string, PairedStats> groupedStats;
+  for (const auto& result : benchmarkResults) {
+    auto [it, inserted] =
+        groupedStats.try_emplace(result.datasetName, PairedStats{});
+    if (inserted) {
+      datasetOrder.push_back(result.datasetName);
+    }
+    if (result.mode == ExchangeMode::kNormal) {
+      it->second.normal = &result.stats;
+    } else {
+      it->second.optimized = &result.stats;
+    }
+  }
+
+  for (const auto& datasetName : datasetOrder) {
+    const auto& paired = groupedStats.at(datasetName);
+    std::cout << "--------------------" << datasetName << "--------------------"
+              << std::endl;
+    std::cout << "Wall Time | normal: "
+              << formatStat(
+                     paired.normal,
+                     [](const ExchangeRunStats& stats) {
+                       return succinctMicros(stats.wallUs);
+                     })
+              << " | optimized: "
+              << formatStat(
+                     paired.optimized,
+                     [](const ExchangeRunStats& stats) {
+                       return succinctMicros(stats.wallUs);
+                     })
+              << std::endl;
+    std::cout << "Normal" << std::endl
+              << " - PartitionedOutput: "
+              << formatStat(
+                     paired.normal,
+                     [](const ExchangeRunStats& stats) {
+                       return stats.partitionedOutputStats.toString();
+                     })
+              << std::endl
+              << " - Exchange: "
+              << formatStat(
+                     paired.normal,
+                     [](const ExchangeRunStats& stats) {
+                       return stats.exchangeStats.toString();
+                     })
+              << std::endl;
+    std::cout << "Optimized" << std::endl
+              << " - PartitionedOutput: "
+              << formatStat(
+                     paired.optimized,
+                     [](const ExchangeRunStats& stats) {
+                       return stats.partitionedOutputStats.toString();
+                     })
+              << std::endl
+              << " - Exchange: "
+              << formatStat(
+                     paired.optimized,
+                     [](const ExchangeRunStats& stats) {
+                       return stats.exchangeStats.toString();
+                     })
+              << std::endl;
+  }
+}
+
+template <typename Fn>
+ExchangeRunStats runBenchmarkIterations(unsigned int iters, Fn&& runOnce) {
+  ExchangeRunStats lastRun;
+  for (; iters > 0; --iters) {
+    lastRun = runOnce();
+  }
+  return lastRun; // Only the final iteration's stats are kept.
 }
 
 class ExchangeBenchmark : public VectorTestBase {
  public:
+  /// Creates a flat column of `type` with `numRows` rows. A row is null when
+  /// row % 100 < nullPct, i.e. the first `nullPct` rows of each block of 100,
+  /// so vectors shorter than 100 rows can exceed `nullPct` percent nulls.
+  /// Values are the row number cast to the native type (parity for BOOLEAN).
+  VectorPtr makeColumn(const TypePtr& type, int32_t numRows, int32_t nullPct) {
+    std::function<bool(vector_size_t)> isNull;
+    if (nullPct == 100) {
+      isNull = [](auto) { return true; };
+    } else if (nullPct > 0) {
+      isNull = [nullPct](vector_size_t row) { return (row % 100) < nullPct; };
+    }
+
+    switch (type->kind()) {
+      case TypeKind::BOOLEAN:
+        return makeFlatVector<bool>(
+            numRows, [](auto row) { return row % 2 == 0; }, isNull);
+      case TypeKind::TINYINT:
+        return makeFlatVector<int8_t>(
+            numRows, [](auto row) { return static_cast<int8_t>(row); }, isNull);
+      case TypeKind::SMALLINT:
+        return makeFlatVector<int16_t>(
+            numRows,
+            [](auto row) { return static_cast<int16_t>(row); },
+            isNull);
+      case TypeKind::INTEGER:
+        return makeFlatVector<int32_t>(
+            numRows, [](auto row) { return row; }, isNull);
+      case TypeKind::BIGINT:
+        // Handles plain BIGINT and short-decimal columns (DECIMAL(p,s), p≤18).
+        return makeFlatVector<int64_t>(
+            numRows,
+            [](auto row) { return static_cast<int64_t>(row); },
+            isNull,
+            type);
+      case TypeKind::REAL:
+        return makeFlatVector<float>(
+            numRows, [](auto row) { return static_cast<float>(row); }, isNull);
+      case TypeKind::DOUBLE:
+        return makeFlatVector<double>(
+            numRows, [](auto row) { return static_cast<double>(row); }, isNull);
+      case TypeKind::HUGEINT:
+        // Handles long-decimal columns (DECIMAL(p,s), p>18).
+        return makeFlatVector<int128_t>(
+            numRows,
+            [](auto row) { return static_cast<int128_t>(row); },
+            isNull,
+            type);
+      default:
+        VELOX_NYI(
+            "makeColumn does not support complex type {} yet",
+            type->toString());
+    }
+  }
+
+  /// Generates input batches for the exchange benchmark.
+  ///
+  /// `dictPct` is the percentage of vectors for each column that should be
+  /// wrapped in dictionary encoding across the full set of generated batches.
+  /// For example, with `numVectors = 10` and `dictPct = 30`, each top-level
+  /// column will have 3 dictionary-encoded vectors and 7 simple vectors.
+  /// Nested children of complex columns use the same rule recursively.
+  ///
+  /// `nullPct` controls what fraction of values in each column are null:
+  /// 0 = no nulls, 50 = half the rows null, 100 = all rows null.
   std::vector<RowVectorPtr> makeRows(
-      RowTypePtr type,
+      const RowTypePtr& type,
       int32_t numVectors,
       int32_t rowsPerVector,
-      int32_t dictPct = 0) {
+      int32_t dictPct = 0,
+      int32_t nullPct = 0) {
     std::vector<RowVectorPtr> vectors;
-    BufferPtr indices;
+    vectors.reserve(numVectors);
     for (int32_t i = 0; i < numVectors; ++i) {
-      auto vector = std::dynamic_pointer_cast<RowVector>(
-          BatchMaker::createBatch(type, rowsPerVector, *pool_));
-
-      auto width = vector->childrenSize();
-      for (auto child = 0; child < width; ++child) {
-        if (100 * child / width > dictPct) {
-          if (!indices) {
-            indices = makeIndices(vector->size(), [&](auto i) { return i; });
-          }
-          vector->childAt(child) = BaseVector::wrapInDictionary(
-              nullptr, indices, vector->size(), vector->childAt(child));
+      std::vector<VectorPtr> children;
+      children.reserve(type->size());
+      for (int32_t col = 0; col < type->size(); ++col) {
+        children.push_back(
+            makeColumn(type->childAt(col), rowsPerVector, nullPct));
+      }
+      auto vector = makeRowVector(type->names(), children);
+      if (shouldWrapVector(i, numVectors, dictPct)) {
+        for (auto child = 0; child < vector->childrenSize(); ++child) {
+          wrapDictionaryRecursive(vector->childAt(child));
         }
       }
-      vectors.push_back(vector);
+      vectors.push_back(std::move(vector));
     }
     return vectors;
   }
 
-  void run(
-      std::vector<RowVectorPtr>& vectors,
+  ExchangeRunStats run(
+      const std::vector<RowVectorPtr>& vectors,
       int32_t width,
       int32_t taskWidth,
-      bool useOptimizedPartitionedOutput,
-      int64_t& wallUs,
-      PlanNodeStats& partitionedOutputStats,
-      PlanNodeStats& exchangeStats) {
+      ExchangeMode mode) {
+    VELOX_CHECK(!vectors.empty());
+
     core::PlanNodePtr plan;
     core::PlanNodeId exchangeId;
     core::PlanNodeId leafPartitionedOutputId;
@@ -120,9 +464,7 @@ class ExchangeBenchmark : public VectorTestBase {
 
     const auto startUs = getCurrentTimeMicro();
     BENCHMARK_SUSPEND {
-      assert(!vectors.empty());
-      configSettings_[core::QueryConfig::kMaxPartitionedOutputBufferSize] =
-          fmt::format("{}", FLAGS_exchange_buffer_mb << 20);
+      configureQuerySettings(mode);
       const auto iteration = ++iteration_;
 
       // leafPlan: PartitionedOutput/kPartitioned(1) <-- Values(0)
@@ -143,7 +485,6 @@ class ExchangeBenchmark : public VectorTestBase {
 
       // finalAggPlan: PartitionedOutput/kPartitioned(2) <-- Agg/kSingle(1) <--
       // Exchange(0)
-      std::vector<std::string> finalAggTaskIds;
       core::PlanNodePtr finalAggPlan =
           exec::test::PlanBuilder()
               .exchange(leafPlan->outputType(), "Presto")
@@ -178,39 +519,44 @@ class ExchangeBenchmark : public VectorTestBase {
         .splits(finalAggSplits)
         .assertResults(expected);
 
+    ExchangeRunStats stats;
     BENCHMARK_SUSPEND {
-      wallUs = getCurrentTimeMicro() - startUs;
-      std::vector<int64_t> taskWallMs;
+      stats.wallUs = getCurrentTimeMicro() - startUs;
 
       for (const auto& task : leafTasks) {
         const auto& taskStats = task->taskStats();
-        taskWallMs.push_back(
-            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
         const auto& planStats = toPlanStats(taskStats);
         auto& taskPartitionedOutputStats =
             planStats.at(leafPartitionedOutputId);
-        partitionedOutputStats += taskPartitionedOutputStats;
+        stats.partitionedOutputStats += taskPartitionedOutputStats;
       }
 
       for (const auto& task : finalAggTasks) {
         const auto& taskStats = task->taskStats();
-        taskWallMs.push_back(
-            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
         const auto& planStats = toPlanStats(taskStats);
 
         auto& taskPartitionedOutputStats =
             planStats.at(finalAggPartitionedOutputId);
-        partitionedOutputStats += taskPartitionedOutputStats;
+        stats.partitionedOutputStats += taskPartitionedOutputStats;
 
         auto& taskExchangeStats = planStats.at(exchangeId);
-        exchangeStats += taskExchangeStats;
+        stats.exchangeStats += taskExchangeStats;
       }
     };
+
+    return stats;
   }
 
  private:
   static constexpr int64_t kMaxMemory = 6UL << 30; // 6GB
 
+  void configureQuerySettings(ExchangeMode mode) {
+    configSettings_[core::QueryConfig::kMaxPartitionedOutputBufferSize] =
+        fmt::format("{}", static_cast<int64_t>(FLAGS_exchange_buffer_mb) << 20);
+    configSettings_[core::QueryConfig::kOptimizedPartitionedOutputEnabled] =
+        mode == ExchangeMode::kOptimized ? "true" : "false";
+  }
+
   static std::string
   makeTaskId(int32_t iteration, const std::string& prefix, int num) {
     return fmt::format("local://{}-{}-{}", iteration, prefix, num);
@@ -257,130 +603,131 @@ int32_t ExchangeBenchmark::iteration_;
 
 std::unique_ptr<ExchangeBenchmark> bm;
 
-void runBenchmarks(bool optimizedPartitionedOutputEnabled = false) {
-  std::vector<std::string> flatNames = {"c0"};
-  std::vector<TypePtr> flatTypes = {BIGINT()};
-  std::vector<TypePtr> typeSelection = {
-      BOOLEAN(),
-      TINYINT(),
-      DECIMAL(20, 3),
-      INTEGER(),
-      BIGINT(),
-      REAL(),
-      DECIMAL(10, 2),
-      DOUBLE(),
-      VARCHAR()};
-
-  int64_t flatSize = 0;
-  // Add enough columns of different types to make a 10K row batch be
-  // flat_batch_mb in flat size.
-  while (flatSize * 10000 < static_cast<int64_t>(FLAGS_flat_batch_mb) << 20) {
-    flatNames.push_back(fmt::format("c{}", flatNames.size()));
-    assert(!flatNames.empty());
-    flatTypes.push_back(typeSelection[flatTypes.size() % typeSelection.size()]);
-    if (flatTypes.back()->isFixedWidth()) {
-      flatSize += flatTypes.back()->cppSizeInBytes();
-    } else {
-      flatSize += 20;
-    }
-  }
-  auto flatType = ROW(std::move(flatNames), std::move(flatTypes));
-
-  auto structType = ROW(
-      {{"c0", BIGINT()},
-       {"r1",
-        ROW(
-            {{"k2", BIGINT()},
-             {"r2",
-              ROW(
-                  {{"i1", BIGINT()},
-                   {"i2", BIGINT()},
-                   {"r3}, ROW({{s3", VARCHAR()},
-                   {"i5", INTEGER()},
-                   {"d5", DOUBLE()},
-                   {"b5", BOOLEAN()},
-                   {"a5", ARRAY(TINYINT())}})}})}});
-
-  auto deepType = ROW(
-      {{"c0", BIGINT()},
-       {"long_array_val", ARRAY(ARRAY(BIGINT()))},
-       {"array_val", ARRAY(VARCHAR())},
-       {"struct_val", ROW({{"s_int", INTEGER()}, {"s_array", ARRAY(REAL())}})},
-       {"map_val",
-        MAP(VARCHAR(),
-            MAP(BIGINT(),
-                ROW({{"s2_int", INTEGER()}, {"s2_string", VARCHAR()}})))}});
+void benchmarkExchange(
+    unsigned int iters,
+    const ExchangeInputSpec& input,
+    ExchangeMode mode,
+    int32_t dictPct,
+    int32_t nullPct) {
+  const auto batches = bm->makeRows(
+      input.type, input.numVectors, input.rowsPerVector, dictPct, nullPct);
+  auto lastRunStats = runBenchmarkIterations(iters, [&]() {
+    return bm->run(batches, FLAGS_width, FLAGS_task_width, mode);
+  });
+  benchmarkResults.push_back(
+      {fmt::format("{}_dict{}_null{}", input.name, dictPct, nullPct),
+       mode,
+       std::move(lastRunStats)}); // Stats of the last iteration.
+}
 
-  std::vector<RowVectorPtr> flat10k(
-      bm->makeRows(flatType, 10, 10000, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> deep10k(
-      bm->makeRows(deepType, 10, 10000, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> flat50(
-      bm->makeRows(flatType, 2000, 50, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> deep50(
-      bm->makeRows(deepType, 2000, 50, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> struct1k(
-      bm->makeRows(structType, 100, 1000, FLAGS_dict_pct));
-
-  std::vector<std::pair<std::string, std::vector<RowVectorPtr>*>> exchangeCases{
-      {"Flat10K", &flat10k},
-      {"Flat50", &flat50},
-      {"Deep10K", &deep10k},
-      {"Deep50", &deep50},
-      {"Struct1K", &struct1k}};
-
-  std::vector<ExchangeRunStats> normalPartitionedOutputStats(
-      exchangeCases.size());
-  std::vector<ExchangeRunStats> optimizedPartitionedOutputStats(
-      exchangeCases.size());
-
-  for (size_t i = 0; i < exchangeCases.size(); ++i) {
-    const auto& name = exchangeCases[i].first;
-    folly::addBenchmark(
-        __FILE__,
-        fmt::format("exchange{}_normalPartitionedOutput", name),
-        [&, i]() {
-          bm->run(
-              *exchangeCases[i].second,
-              FLAGS_width,
-              FLAGS_task_width,
-              false,
-              normalPartitionedOutputStats[i].wallUs,
-              normalPartitionedOutputStats[i].partitionedOutputStats,
-              normalPartitionedOutputStats[i].exchangeStats);
-          return 1;
-        });
-    if (optimizedPartitionedOutputEnabled) {
-      folly::addBenchmark(
-          __FILE__,
-          fmt::format("exchange{}_optimizedPartitionedOutput", name),
-          [&, i]() {
-            bm->run(
-                *exchangeCases[i].second,
-                FLAGS_width,
-                FLAGS_task_width,
-                true,
-                optimizedPartitionedOutputStats[i].wallUs,
-                optimizedPartitionedOutputStats[i].partitionedOutputStats,
-                optimizedPartitionedOutputStats[i].exchangeStats);
-            return 1;
-          });
-    }
+#define EXCHANGE_BENCHMARK_NAMED_PARAM(name, param_name, ...) \
+  BENCHMARK_IMPL(                                             \
+      FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),    \
+      FOLLY_PP_STRINGIZE(param_name),                         \
+      iters,                                                  \
+      unsigned,                                               \
+      iters) {                                                \
+    name(iters, ##__VA_ARGS__);                               \
   }
 
-  folly::runBenchmarks();
-
-  for (size_t i = 0; i < exchangeCases.size(); ++i) {
-    printExchangeStats(
-        exchangeCases[i].first, "normal", normalPartitionedOutputStats[i]);
-    if (optimizedPartitionedOutputEnabled) {
-      printExchangeStats(
-          exchangeCases[i].first,
-          "optimized",
-          optimizedPartitionedOutputStats[i]);
-    }
-  }
-}
+// ── Benchmarks: input spec × nullPct × mode ───────────────────────────────
+
+#define EXCHANGE_BENCHMARK_INPUT(                                     \
+    _case_name, _input_expr, _mode_name, _dict_pct, _null_pct, _mode) \
+  EXCHANGE_BENCHMARK_NAMED_PARAM(                                     \
+      benchmarkExchange,                                              \
+      _case_name##_dict##_dict_pct##_null##_null_pct##_##_mode_name,  \
+      _input_expr,                                                    \
+      ExchangeMode::_mode,                                            \
+      _dict_pct,                                                      \
+      _null_pct)
+
+#define EXCHANGE_BENCHMARK_MODES(                                      \
+    _case_name, _input_expr, _dict_pct, _null_pct)                     \
+  EXCHANGE_BENCHMARK_INPUT(                                            \
+      _case_name, _input_expr, normal, _dict_pct, _null_pct, kNormal); \
+  EXCHANGE_BENCHMARK_INPUT(                                            \
+      _case_name, _input_expr, optimized, _dict_pct, _null_pct, kOptimized)
+
+#define EXCHANGE_BENCHMARK_CASE(_case_name, _input_expr)    \
+  EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 0);  \
+  EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 50); \
+  EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 100)
+
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Boolean_col1,
+    makeInputSpec(SimpleColType::kBoolean, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Boolean_col4,
+    makeInputSpec(SimpleColType::kBoolean, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Boolean_col16,
+    makeInputSpec(SimpleColType::kBoolean, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Tinyint_col1,
+    makeInputSpec(SimpleColType::kTinyint, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Tinyint_col4,
+    makeInputSpec(SimpleColType::kTinyint, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Tinyint_col16,
+    makeInputSpec(SimpleColType::kTinyint, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Integer_col1,
+    makeInputSpec(SimpleColType::kInteger, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Integer_col4,
+    makeInputSpec(SimpleColType::kInteger, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Integer_col16,
+    makeInputSpec(SimpleColType::kInteger, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Bigint_col1,
+    makeInputSpec(SimpleColType::kBigint, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Bigint_col4,
+    makeInputSpec(SimpleColType::kBigint, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Bigint_col16,
+    makeInputSpec(SimpleColType::kBigint, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Hugeint_col1,
+    makeInputSpec(SimpleColType::kHugeint, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Hugeint_col4,
+    makeInputSpec(SimpleColType::kHugeint, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Hugeint_col16,
+    makeInputSpec(SimpleColType::kHugeint, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_LongDecimal_col1,
+    makeInputSpec(SimpleColType::kLongDecimal, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_LongDecimal_col4,
+    makeInputSpec(SimpleColType::kLongDecimal, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_LongDecimal_col16,
+    makeInputSpec(SimpleColType::kLongDecimal, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Double_col1,
+    makeInputSpec(SimpleColType::kDouble, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Double_col4,
+    makeInputSpec(SimpleColType::kDouble, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Double_col16,
+    makeInputSpec(SimpleColType::kDouble, 16));
+
+// The complex type benchmarks are temporarily disabled.
+// EXCHANGE_BENCHMARK_CASE(Deep10K, makeInputSpec(ExchangeInputKind::kDeep10K));
+// EXCHANGE_BENCHMARK_CASE(Deep50, makeInputSpec(ExchangeInputKind::kDeep50));
+// EXCHANGE_BENCHMARK_CASE(Struct1K,
+// makeInputSpec(ExchangeInputKind::kStruct1K));
+
+#undef EXCHANGE_BENCHMARK_CASE
+#undef EXCHANGE_BENCHMARK_MODES
+#undef EXCHANGE_BENCHMARK_INPUT
+#undef EXCHANGE_BENCHMARK_NAMED_PARAM
 
 } // namespace
 
@@ -396,7 +743,8 @@ int main(int argc, char** argv) {
   exec::ExchangeSource::registerFactory(exec::test::createLocalExchangeSource);
 
   bm = std::make_unique<ExchangeBenchmark>();
-  runBenchmarks();
+  folly::runBenchmarks();
+  printAllExchangeStats();
   bm.reset();
 
   return 0;

From b38390fa07b0eaf29ca183f9805b3bd30647de7c Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Fri, 1 May 2026 02:23:31 -0700
Subject: [PATCH 16/24] perf: Introduce OptimizedVectorHasher

The new OptimizedVectorHasher is up to 2-3x faster than VectorHasher.
---
 velox/exec/CMakeLists.txt                     |   2 +
 velox/exec/OptimizedVectorHasher.cpp          | 407 ++++++++++++++++++
 velox/exec/OptimizedVectorHasher.h            |  74 ++++
 velox/exec/benchmarks/CMakeLists.txt          |   9 +
 .../OptimizedVectorHasherBenchmark.cpp        | 385 +++++++++++++++++
 velox/exec/tests/CMakeLists.txt               |   1 +
 .../exec/tests/OptimizedVectorHasherTest.cpp  | 308 +++++++++++++
 7 files changed, 1186 insertions(+)
 create mode 100644 velox/exec/OptimizedVectorHasher.cpp
 create mode 100644 velox/exec/OptimizedVectorHasher.h
 create mode 100644 velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
 create mode 100644 velox/exec/tests/OptimizedVectorHasherTest.cpp

diff --git a/velox/exec/CMakeLists.txt b/velox/exec/CMakeLists.txt
index 53b88bd28e3..d77f0305bfd 100644
--- a/velox/exec/CMakeLists.txt
+++ b/velox/exec/CMakeLists.txt
@@ -72,6 +72,7 @@ velox_add_library(
   OperatorTraceWriter.cpp
   OperatorUtils.cpp
   OptimizedPartitionedOutput.cpp
+  OptimizedVectorHasher.cpp
   OrderBy.cpp
   OutputBuffer.cpp
   OutputBufferManager.cpp
@@ -178,6 +179,7 @@ velox_add_library(
   OperatorTraceWriter.h
   OperatorType.h
   OperatorUtils.h
+  OptimizedVectorHasher.h
   OrderBy.h
   OutputBuffer.h
   OutputBufferManager.h
diff --git a/velox/exec/OptimizedVectorHasher.cpp b/velox/exec/OptimizedVectorHasher.cpp
new file mode 100644
index 00000000000..507ffc9edb1
--- /dev/null
+++ b/velox/exec/OptimizedVectorHasher.cpp
@@ -0,0 +1,407 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/exec/OptimizedVectorHasher.h"
+
+#include "velox/common/base/SimdUtil.h"
+#include "velox/type/FloatingPointUtil.h"
+
+namespace facebook::velox::exec {
+namespace {
+
+template <bool typeProvidesCustomComparison, TypeKind Kind>
+uint64_t hashOne(const DecodedVector& decoded, vector_size_t index) {
+  constexpr bool kIsComplexKind = Kind == TypeKind::ROW ||
+      Kind == TypeKind::ARRAY || Kind == TypeKind::MAP;
+  if constexpr (!kIsComplexKind) {
+    using T = typename KindToFlatVector<Kind>::HashRowType;
+    const T value = decoded.valueAt<T>(index);
+    if constexpr (typeProvidesCustomComparison) {
+      return static_cast<const CanProvideCustomComparisonType<Kind>*>(
+                 decoded.base()->type().get())
+          ->hash(value);
+    } else if constexpr (std::is_floating_point_v<T>) {
+      return util::floating_point::NaNAwareHash<T>()(value);
+    } else {
+      return folly::hasher<T>()(value);
+    }
+  } else {
+    // ROW, ARRAY and MAP rows are hashed by the base vector itself.
+    return decoded.base()->hashValueAt(decoded.index(index));
+  }
+}
+
+constexpr uint64_t kNullHash = OptimizedVectorHasher::kNullHash;
+
+// Stores `hash` in each of `result[0..size)`. When `Mix` is true the value is
+// combined with what the slot already holds instead of replacing it.
+template <bool Mix>
+inline void broadcastHash(vector_size_t size, uint64_t* result, uint64_t hash) {
+  if constexpr (!Mix) {
+    std::fill(result, result + size, hash);
+  } else {
+    for (vector_size_t row = 0; row < size; ++row) {
+      result[row] = bits::hashMix(result[row], hash);
+    }
+  }
+}
+
+// Stores `computeHash(row)` for every row in [0, size), mixing it into the
+// existing value when `Mix` is true. No null checks are made.
+template <bool Mix, typename ComputeHash>
+inline void
+hashLoopNoNulls(vector_size_t size, uint64_t* result, ComputeHash computeHash) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    const uint64_t hash = computeHash(row);
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+// Row-by-row variant for input that may contain nulls: rows for which
+// `decoded.isNullAt(row)` holds hash to `kNullHash` and `computeHash` is not
+// called for them; every other row uses `computeHash(row)`.
+template <bool Mix, typename ComputeHash>
+inline void hashLoopWithNulls(
+    vector_size_t size,
+    uint64_t* result,
+    const DecodedVector& decoded,
+    ComputeHash computeHash) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    const uint64_t hash = decoded.isNullAt(row) ? kNullHash : computeHash(row);
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+// Row i receives `baseHashes[indices[i]]`, mixed in when `Mix` is true.
+template <bool Mix>
+inline void scatterDictionaryHashes(
+    vector_size_t size,
+    uint64_t* result,
+    const vector_size_t* indices,
+    const uint64_t* baseHashes) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    const uint64_t hash = baseHashes[indices[row]];
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+// Variant of scatterDictionaryHashes() for wrappers that add nulls: rows that
+// `nulls` marks as null hash to `kNullHash` and their `indices` entry is not
+// read; the remaining rows look up `baseHashes[indices[row]]`.
+template <bool Mix>
+inline void scatterDictionaryHashesWithExtraNulls(
+    vector_size_t size,
+    uint64_t* result,
+    const vector_size_t* indices,
+    const uint64_t* nulls,
+    const uint64_t* baseHashes) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    const uint64_t hash =
+        bits::isBitNull(nulls, row) ? kNullHash : baseHashes[indices[row]];
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+/// Converts packed boolean storage into one hash per row of `result`.
+/// @param values Bitmap with one bit per row: set means true, unset false.
+/// @param nulls Optional bitmap; a cleared bit marks a null row (kNullHash).
+template <bool Mix>
+inline void scatterBoolHashes(
+    vector_size_t size,
+    uint64_t* result,
+    const uint64_t* values,
+    const uint64_t* nulls) {
+  using Batch = xsimd::batch<int64_t>;
+  static constexpr vector_size_t kSimdBatchSize = Batch::size;
+  const auto falseHash = folly::hasher<bool>()(false);
+  const auto trueHash = folly::hasher<bool>()(true);
+  // SIMD handles whole batches when not mixing; 'row' tracks the progress.
+  vector_size_t row{0};
+  if constexpr (!Mix) {
+    const auto falseHashBatch =
+        xsimd::broadcast<int64_t>(static_cast<int64_t>(falseHash));
+    const auto trueHashBatch =
+        xsimd::broadcast<int64_t>(static_cast<int64_t>(trueHash));
+    const auto nullHashBatch =
+        xsimd::broadcast<int64_t>(static_cast<int64_t>(kNullHash));
+    auto* const signedResult = reinterpret_cast<int64_t*>(result);
+    // Assumes kSimdBatchSize divides 64, so a batch never straddles a word.
+    for (; row + kSimdBatchSize <= size; row += kSimdBatchSize) {
+      const auto bitOffset = row & 63;
+      const auto valueBits = (values[row / 64] >> bitOffset) &
+          bits::lowMask(static_cast<int32_t>(kSimdBatchSize));
+      auto hashes = xsimd::select(
+          simd::fromBitMask<int64_t>(valueBits), trueHashBatch, falseHashBatch);
+
+      if (nulls != nullptr) {
+        const auto notNullBits = (nulls[row / 64] >> bitOffset) &
+            bits::lowMask(static_cast<int32_t>(kSimdBatchSize));
+        hashes = xsimd::select(
+            simd::fromBitMask<int64_t>(notNullBits), hashes, nullHashBatch);
+      }
+
+      hashes.store_unaligned(signedResult + row);
+    }
+  }
+
+  // TODO: vectorize the Mix case. This scalar loop also handles the tail.
+  for (; row < size; ++row) {
+    const auto hash = nulls != nullptr && bits::isBitNull(nulls, row)
+        ? kNullHash
+        : (bits::isBitSet(values, row) ? trueHash : falseHash);
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+// Calls `body` with std::true_type when `mix` is set, else std::false_type.
+template <typename Body>
+inline void dispatchMix(bool mix, Body body) {
+  if (!mix) {
+    body(std::false_type{});
+    return;
+  }
+  body(std::true_type{});
+}
+
+template <typename ComputeHash>
+inline void hashDecoded(
+    bool mix,
+    vector_size_t size,
+    uint64_t* result,
+    const DecodedVector& decoded,
+    ComputeHash computeHash) {
+  dispatchMix(mix, [&](auto tag) { // Turns `mix` into a constant.
+    constexpr bool kShouldMix = decltype(tag)::value;
+    if (!decoded.mayHaveNulls()) {
+      hashLoopNoNulls<kShouldMix>(size, result, computeHash);
+    } else {
+      hashLoopWithNulls<kShouldMix>(size, result, decoded, computeHash);
+    }
+  });
+}
+
+} // namespace
+
+OptimizedVectorHasher::OptimizedVectorHasher(
+    TypePtr type,
+    column_index_t channel)
+    : channel_(channel),
+      type_(std::move(type)),
+      typeKind_(type_->kind()), // Member, since 'type' was moved from.
+      typeProvidesCustomComparison_(type_->providesCustomComparison()) {}
+
+void OptimizedVectorHasher::decode(
+    const BaseVector& vector,
+    const SelectivityVector& rows) {
+  VELOX_CHECK(
+      type_->kindEquals(vector.type()),
+      "Type mismatch: {} vs. {}",
+      type_->toString(),
+      vector.type()->toString());
+  decoded_.decode(vector, rows); // Kept in decoded_ for later hash() calls.
+}
+
+void OptimizedVectorHasher::hash(bool mix, raw_vector<uint64_t>& result) {
+  if (typeKind_ != TypeKind::UNKNOWN) {
+    VELOX_DYNAMIC_TYPE_DISPATCH(hashValues, typeKind_, mix, result.data());
+  } else { // UNKNOWN type: every row gets kNullHash.
+    dispatchMix(mix, [&](auto tag) {
+      broadcastHash<decltype(tag)::value>(
+          decoded_.size(), result.data(), kNullHash);
+    });
+  }
+}
+
+void OptimizedVectorHasher::hash(
+    const SelectivityVector& rows,
+    bool mix,
+    raw_vector<uint64_t>& result) {
+  if (decoded_.size() == 0 || result.empty() || rows.isAllSelected()) {
+    hash(mix, result);
+    return;
+  }
+
+  const auto original = result;
+  hash(mix, result);
+
+  // hash(mix, result) fills the full decoded extent. Put back every row that
+  // is not selected. Rows at or past rows.size() count as not selected.
+  const auto numRows = static_cast<vector_size_t>(result.size());
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    if (row >= rows.size() || !rows.isValid(row)) {
+      result[row] = original[row];
+    }
+  }
+}
+
+// Hashes every decoded row of a column of kind `Kind` into `result`, choosing
+// between the flat fast path and the general hashTyped() path.
+template <TypeKind Kind>
+void OptimizedVectorHasher::hashValues(bool mix, uint64_t* result) {
+  using T = typename TypeTraits<Kind>::NativeType;
+  constexpr bool kIsComplexKind = Kind == TypeKind::ROW ||
+      Kind == TypeKind::ARRAY || Kind == TypeKind::MAP;
+
+  // The flat fast path applies only to identity-mapped vectors of a scalar
+  // kind whose type does not provide custom comparison. A constant mapping,
+  // any other non-identity mapping and complex kinds go through hashTyped().
+  const bool useFlatPath = !kIsComplexKind && !decoded_.isConstantMapping() &&
+      decoded_.isIdentityMapping() && !typeProvidesCustomComparison_;
+  if (useFlatPath) {
+    hashFlatValues<T>(mix, result);
+    return;
+  }
+
+  if (typeProvidesCustomComparison_) {
+    hashTyped<true, Kind>(mix, result);
+  } else {
+    hashTyped<false, Kind>(mix, result);
+  }
+}
+
+template <bool typeProvidesCustomComparison, TypeKind Kind>
+void OptimizedVectorHasher::hashTyped(bool mix, uint64_t* result) {
+  const auto size = decoded_.size();
+
+  // Constant mapping: hash the single value (or null) once and broadcast it.
+  if (decoded_.isConstantMapping()) {
+    const uint64_t hash = decoded_.isNullAt(0)
+        ? kNullHash
+        : hashOne<typeProvidesCustomComparison, Kind>(decoded_, 0);
+    dispatchMix(mix, [&](auto mixTag) {
+      broadcastHash<decltype(mixTag)::value>(size, result, hash);
+    });
+    return;
+  }
+
+  // Non-identity mapping with more rows than its base: hash each base value
+  // once into dictionaryHashes_, then look the hashes up through the indices.
+  if (!decoded_.isIdentityMapping() && size > decoded_.base()->size()) {
+    const DecodedVector baseDecoded(*decoded_.base());
+    const auto baseSize = decoded_.base()->size();
+    dictionaryHashes_.resize(baseSize);
+    const auto computeBaseHash = [&](vector_size_t i) {
+      return hashOne<typeProvidesCustomComparison, Kind>(baseDecoded, i);
+    };
+    hashDecoded(
+        false,
+        baseSize,
+        dictionaryHashes_.data(),
+        baseDecoded,
+        computeBaseHash);
+    // NOTE(review): reads indices[i] for all i < size; confirm all are valid.
+    const auto* const indices = decoded_.indices();
+    dispatchMix(mix, [&](auto mixTag) {
+      constexpr bool kMix = decltype(mixTag)::value;
+      if (decoded_.hasExtraNulls()) {
+        scatterDictionaryHashesWithExtraNulls<kMix>(
+            size, result, indices, decoded_.nulls(), dictionaryHashes_.data());
+      } else {
+        scatterDictionaryHashes<kMix>(
+            size, result, indices, dictionaryHashes_.data());
+      }
+    });
+    return;
+  }
+
+  // Generic fallback: hash each row through the decoded mapping.
+  const auto computeHash = [&](vector_size_t i) {
+    return hashOne<typeProvidesCustomComparison, Kind>(decoded_, i);
+  };
+  hashDecoded(mix, size, result, decoded_, computeHash);
+}
+
+template <typename T>
+void OptimizedVectorHasher::hashFlatValues(bool mix, uint64_t* result) {
+  if constexpr (!std::is_void_v<T>) {
+    // Values are read straight from the decoded data buffer, indexed by row.
+    const T* const rawValues = decoded_.data<T>();
+    const auto hashAt = [&](vector_size_t row) {
+      if constexpr (std::is_floating_point_v<T>) {
+        return util::floating_point::NaNAwareHash<T>()(rawValues[row]);
+      } else {
+        return folly::hasher<T>()(rawValues[row]);
+      }
+    };
+    hashDecoded(mix, decoded_.size(), result, decoded_, hashAt);
+  } else {
+    VELOX_NYI();
+  }
+}
+
+template <>
+void OptimizedVectorHasher::hashFlatValues<bool>(bool mix, uint64_t* result) {
+  const auto* const values = decoded_.data<uint64_t>(); // Bit-packed bools.
+  const auto* const nulls =
+      decoded_.mayHaveNulls() ? decoded_.nulls() : nullptr;
+  dispatchMix(mix, [&](auto mixTag) {
+    scatterBoolHashes<decltype(mixTag)::value>(
+        decoded_.size(), result, values, nulls);
+  });
+}
+
+void OptimizedVectorHasher::hashPrecomputed(
+    bool mix,
+    raw_vector<uint64_t>& result) const {
+  dispatchMix(mix, [&](auto mixTag) { // Uses the hash stored by precompute().
+    broadcastHash<decltype(mixTag)::value>(
+        result.size(), result.data(), precomputedHash_);
+  });
+}
+
+void OptimizedVectorHasher::precompute(const BaseVector& value) {
+  if (value.isNullAt(0)) {
+    precomputedHash_ = kNullHash;
+    return;
+  }
+  // Only row 0 of 'value' is hashed; decoding replaces the state in decoded_.
+  decoded_.decode(value);
+  if (typeKind_ == TypeKind::UNKNOWN) {
+    precomputedHash_ = kNullHash;
+    return;
+  }
+  // Resolve the custom-comparison flag to a compile-time template argument.
+  if (!typeProvidesCustomComparison_) {
+    precomputedHash_ = VELOX_DYNAMIC_TEMPLATE_TYPE_DISPATCH(
+        hashOne, false, typeKind_, decoded_, 0);
+  } else {
+    precomputedHash_ = VELOX_DYNAMIC_TEMPLATE_TYPE_DISPATCH(
+        hashOne, true, typeKind_, decoded_, 0);
+  }
+}
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedVectorHasher.h b/velox/exec/OptimizedVectorHasher.h
new file mode 100644
index 00000000000..830b453abe8
--- /dev/null
+++ b/velox/exec/OptimizedVectorHasher.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/common/memory/RawVector.h"
+#include "velox/exec/Operator.h"
+#include "velox/vector/VectorTypeUtils.h"
+
+namespace facebook::velox::exec {
+
+class OptimizedVectorHasher {
+ public:
+  OptimizedVectorHasher(TypePtr type, column_index_t channel);
+
+  static std::unique_ptr<OptimizedVectorHasher> create(
+      TypePtr type,
+      column_index_t channel) {
+    return std::make_unique<OptimizedVectorHasher>(std::move(type), channel);
+  }
+
+  column_index_t channel() const {
+    return channel_;
+  }
+
+  // Decodes the 'vector' in preparation for calling hash(). The decoded
+  // state is kept private to this class; it is not exposed through a
+  // getter.
+  void decode(const BaseVector& vector, const SelectivityVector& rows);
+
+  void hash(bool mix, raw_vector<uint64_t>& result);
+
+  void
+  hash(const SelectivityVector& rows, bool mix, raw_vector<uint64_t>& result);
+
+  void hashPrecomputed(bool mix, raw_vector<uint64_t>& result) const;
+
+  void precompute(const BaseVector& value);
+
+  static constexpr uint64_t kNullHash = BaseVector::kNullHash;
+
+  template <TypeKind Kind>
+  void hashValues(bool mix, uint64_t* result);
+
+ private:
+  template <bool typeProvidesCustomComparison, TypeKind Kind>
+  void hashTyped(bool mix, uint64_t* result);
+
+  template <typename T>
+  void hashFlatValues(bool mix, uint64_t* result);
+
+  const column_index_t channel_;
+  const TypePtr type_;
+  const TypeKind typeKind_;
+  const bool typeProvidesCustomComparison_;
+
+  DecodedVector decoded_;
+  raw_vector<uint64_t> dictionaryHashes_;
+  uint64_t precomputedHash_{0};
+};
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/benchmarks/CMakeLists.txt b/velox/exec/benchmarks/CMakeLists.txt
index 2f97200c904..3962d439833 100644
--- a/velox/exec/benchmarks/CMakeLists.txt
+++ b/velox/exec/benchmarks/CMakeLists.txt
@@ -20,6 +20,15 @@ target_link_libraries(
   Folly::follybenchmark
 )
 
+add_executable(velox_exec_optimized_vector_hasher_benchmark OptimizedVectorHasherBenchmark.cpp)
+
+target_link_libraries(
+  velox_exec_optimized_vector_hasher_benchmark
+  velox_exec
+  velox_vector_test_lib
+  Folly::follybenchmark
+)
+
 add_executable(velox_filter_project_benchmark FilterProjectBenchmark.cpp)
 
 target_link_libraries(
diff --git a/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
new file mode 100644
index 00000000000..5dffdd7cbf3
--- /dev/null
+++ b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <array>
+#include <numeric>
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/exec/OptimizedVectorHasher.h"
+#include "velox/exec/VectorHasher.h"
+#include "velox/type/HugeInt.h"
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/tests/utils/VectorMaker.h"
+
+// Add the following definitions to allow Clion runs.
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+enum class NullMode {
+  kNoNulls,
+  kHalfNulls,
+  kAllNulls,
+};
+
+enum class EncodingMode {
+  kFlat,
+  kDictionary,
+  kConstant,
+};
+
+template <typename T>
+T makeValue(vector_size_t row) {
+  return static_cast<T>((row * 8191ULL) ^ (row >> 3)); // ULL avoids overflow.
+}
+
+template <>
+bool makeValue<bool>(vector_size_t row) {
+  return (row & 1) == 0;
+}
+
+template <>
+float makeValue<float>(vector_size_t row) {
+  return static_cast<float>(row) * 1.25f - 1000.0f;
+}
+
+template <>
+double makeValue<double>(vector_size_t row) {
+  return static_cast<double>(row) * 1.25 - 1000.0;
+}
+
+template <>
+int128_t makeValue<int128_t>(vector_size_t row) {
+  return HugeInt::build(
+      static_cast<int64_t>(row * 31),
+      static_cast<uint64_t>(row * 1315423911ULL + 17));
+}
+
+template <>
+StringView makeValue<StringView>(vector_size_t row) {
+  thread_local std::array<char, 20> buffer; // Longest value is 5 + 15 = 20.
+  const auto length = 5 + row % 16; // Cycles through 5..20 for row >= 0.
+  for (vector_size_t i = 0; i < length; ++i) {
+    buffer[i] = 'a' + (row + i * 7) % 26; // Lowercase letter for row >= 0.
+  }
+  return StringView(buffer.data(), length); // View may point into 'buffer'.
+}
+
+std::function<bool(vector_size_t)> makeNulls(NullMode nullMode) {
+  switch (nullMode) {
+    case NullMode::kNoNulls:
+      return nullptr;
+    case NullMode::kHalfNulls:
+      return [](vector_size_t row) { return (row & 1) == 0; };
+    case NullMode::kAllNulls:
+      return [](vector_size_t /*row*/) { return true; };
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+template <typename T>
+VectorPtr makeValuesVector(
+    VectorMaker& vectorMaker,
+    memory::MemoryPool* pool,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    vector_size_t numValues,
+    vector_size_t dictionarySize) {
+  auto flat = vectorMaker.flatVector<T>(
+      encodingMode == EncodingMode::kDictionary ? dictionarySize : numValues,
+      [](vector_size_t row) { return makeValue<T>(row); },
+      makeNulls(nullMode));
+  // 'flat' is returned as is, or used as the base of the requested encoding.
+  switch (encodingMode) {
+    case EncodingMode::kFlat:
+      return flat;
+    case EncodingMode::kDictionary: {
+      auto indices = AlignedBuffer::allocate<vector_size_t>(numValues, pool);
+      auto* rawIndices = indices->asMutable<vector_size_t>();
+      for (vector_size_t i = 0; i < numValues; ++i) { // Descending, cyclic.
+        rawIndices[i] = (numValues - i - 1) % dictionarySize;
+      }
+      return BaseVector::wrapInDictionary(
+          BufferPtr(nullptr), indices, numValues, flat);
+    }
+    case EncodingMode::kConstant:
+      if (nullMode == NullMode::kAllNulls) {
+        return BaseVector::createNullConstant(
+            CppToType<T>::create(), numValues, pool);
+      }
+      return BaseVector::wrapInConstant(numValues, 0, flat); // Uses row 0.
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+template <typename Hasher>
+struct HasherRunner;
+
+template <>
+struct HasherRunner<VectorHasher> {
+  static std::unique_ptr<VectorHasher> create(const TypePtr& type) {
+    return VectorHasher::create(type, 0);
+  }
+};
+
+template <>
+struct HasherRunner<OptimizedVectorHasher> {
+  static std::unique_ptr<OptimizedVectorHasher> create(const TypePtr& type) {
+    return OptimizedVectorHasher::create(type, 0);
+  }
+};
+
+template <typename T, typename Hasher>
+void runHashBenchmark(
+    uint32_t iterations,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    bool mix,
+    vector_size_t size,
+    vector_size_t dictionarySize) {
+  folly::BenchmarkSuspender suspender;
+  // Everything up to dismiss() runs with benchmark timing suspended.
+  auto pool = memory::memoryManager()->addLeafPool();
+  VectorMaker vectorMaker(pool.get());
+  auto values = makeValuesVector<T>(
+      vectorMaker, pool.get(), nullMode, encodingMode, size, dictionarySize);
+  auto hasher = HasherRunner<Hasher>::create(CppToType<T>::create());
+  raw_vector<uint64_t> hashes(size, pool.get());
+
+  SelectivityVector rows(size);
+  hasher->decode(*values, rows);
+  if (mix) {
+    std::iota(hashes.begin(), hashes.end(), 0); // Seeds for the mix case.
+  }
+
+  suspender.dismiss();
+  // Timed loop: 'hashes' is reused and never reset between iterations.
+  for (uint32_t i = 0; i < iterations; ++i) {
+    hasher->hash(rows, mix, hashes);
+    folly::doNotOptimizeAway(hashes.data());
+  }
+}
+
+template <typename T>
+void benchmarkVectorHasher(
+    uint32_t iterations,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    bool mix,
+    vector_size_t size,
+    vector_size_t dictionarySize) {
+  runHashBenchmark<T, VectorHasher>(
+      iterations, nullMode, encodingMode, mix, size, dictionarySize);
+}
+
+template <typename T>
+void benchmarkOptimizedVectorHasher(
+    uint32_t iterations,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    bool mix,
+    vector_size_t size,
+    vector_size_t dictionarySize) {
+  runHashBenchmark<T, OptimizedVectorHasher>(
+      iterations, nullMode, encodingMode, mix, size, dictionarySize);
+}
+
+#define REGISTER_HASHER_PAIR(                                                      \
+    T,                                                                             \
+    TYPE_NAME,                                                                     \
+    NULL_MODE,                                                                     \
+    NULL_NAME,                                                                     \
+    ENCODING_MODE,                                                                 \
+    ENCODING_NAME,                                                                 \
+    MIX,                                                                           \
+    MIX_NAME,                                                                      \
+    SIZE,                                                                          \
+    DICTIONARY_SIZE)                                                               \
+  BENCHMARK(                                                                       \
+      TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME##_##SIZE, n) {        \
+    benchmarkVectorHasher<T>(                                                      \
+        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE);                  \
+  }                                                                                \
+  BENCHMARK_RELATIVE(                                                              \
+      optimized_##TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME##_##SIZE, \
+      n) {                                                                         \
+    benchmarkOptimizedVectorHasher<T>(                                             \
+        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE);                  \
+  }                                                                                \
+  BENCHMARK_DRAW_LINE();
+
+#define REGISTER_HASHER_NULL_MODES( \
+    T,                              \
+    TYPE_NAME,                      \
+    ENCODING_MODE,                  \
+    ENCODING_NAME,                  \
+    MIX,                            \
+    MIX_NAME,                       \
+    SIZE,                           \
+    DICTIONARY_SIZE)                \
+  REGISTER_HASHER_PAIR(             \
+      T,                            \
+      TYPE_NAME,                    \
+      NullMode::kNoNulls,           \
+      no_null,                      \
+      ENCODING_MODE,                \
+      ENCODING_NAME,                \
+      MIX,                          \
+      MIX_NAME,                     \
+      SIZE,                         \
+      DICTIONARY_SIZE)              \
+  REGISTER_HASHER_PAIR(             \
+      T,                            \
+      TYPE_NAME,                    \
+      NullMode::kHalfNulls,         \
+      half_null,                    \
+      ENCODING_MODE,                \
+      ENCODING_NAME,                \
+      MIX,                          \
+      MIX_NAME,                     \
+      SIZE,                         \
+      DICTIONARY_SIZE)              \
+  REGISTER_HASHER_PAIR(             \
+      T,                            \
+      TYPE_NAME,                    \
+      NullMode::kAllNulls,          \
+      all_null,                     \
+      ENCODING_MODE,                \
+      ENCODING_NAME,                \
+      MIX,                          \
+      MIX_NAME,                     \
+      SIZE,                         \
+      DICTIONARY_SIZE)
+
+#define REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, SIZE) \
+  REGISTER_HASHER_PAIR(                                                        \
+      T,                                                                       \
+      TYPE_NAME,                                                               \
+      NullMode::kNoNulls,                                                      \
+      no_null,                                                                 \
+      EncodingMode::kConstant,                                                 \
+      constant,                                                                \
+      MIX,                                                                     \
+      MIX_NAME,                                                                \
+      SIZE,                                                                    \
+      SIZE)                                                                    \
+  REGISTER_HASHER_PAIR(                                                        \
+      T,                                                                       \
+      TYPE_NAME,                                                               \
+      NullMode::kAllNulls,                                                     \
+      all_null,                                                                \
+      EncodingMode::kConstant,                                                 \
+      constant,                                                                \
+      MIX,                                                                     \
+      MIX_NAME,                                                                \
+      SIZE,                                                                    \
+      SIZE)
+
+#define REGISTER_HASHER_SIZES(                                                 \
+    T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME)                 \
+  REGISTER_HASHER_NULL_MODES(                                                  \
+      T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME, 10000, 10000) \
+  REGISTER_HASHER_NULL_MODES(                                                  \
+      T,                                                                       \
+      TYPE_NAME,                                                               \
+      ENCODING_MODE,                                                           \
+      ENCODING_NAME,                                                           \
+      MIX,                                                                     \
+      MIX_NAME,                                                                \
+      1000000,                                                                 \
+      1000000)
+
+#define REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME)       \
+  REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, 10000) \
+  REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, 1000000)
+
+#define REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(         \
+    T, TYPE_NAME, MIX, MIX_NAME, SIZE, PERCENT, PERCENT_NAME) \
+  REGISTER_HASHER_NULL_MODES(                                 \
+      T,                                                      \
+      TYPE_NAME,                                              \
+      EncodingMode::kDictionary,                              \
+      dictionary_##PERCENT_NAME,                              \
+      MIX,                                                    \
+      MIX_NAME,                                               \
+      SIZE,                                                   \
+      SIZE* PERCENT / 100)
+
+#define REGISTER_HASHER_SIZES_DICTIONARY(T, TYPE_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 80, 80pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 60, 60pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 40, 40pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 20, 20pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 5, 5pct)                    \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 80, 80pct)                \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 60, 60pct)                \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 40, 40pct)                \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 20, 20pct)                \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 5, 5pct)
+
+#define REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, MIX, MIX_NAME)  \
+  REGISTER_HASHER_SIZES(                                        \
+      T, TYPE_NAME, EncodingMode::kFlat, flat, MIX, MIX_NAME)   \
+  REGISTER_HASHER_SIZES_DICTIONARY(T, TYPE_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME)
+
+#define REGISTER_HASHER_TYPE(T, TYPE_NAME)               \
+  REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, false, no_mix) \
+  REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, true, mix)
+
+REGISTER_HASHER_TYPE(bool, boolean)
+REGISTER_HASHER_TYPE(int32_t, integer)
+REGISTER_HASHER_TYPE(int64_t, bigint)
+REGISTER_HASHER_TYPE(int128_t, hugeint)
+REGISTER_HASHER_TYPE(float, real)
+REGISTER_HASHER_TYPE(double, double)
+REGISTER_HASHER_TYPE(StringView, varchar)
+
+#undef REGISTER_HASHER_TYPE
+#undef REGISTER_HASHER_SIZES_DICTIONARY
+#undef REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT
+#undef REGISTER_HASHER_SIZES
+#undef REGISTER_HASHER_NULL_MODES
+#undef REGISTER_HASHER_PAIR
+
+} // namespace
+
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv};
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt
index 58b480478ef..119fcecc0bd 100644
--- a/velox/exec/tests/CMakeLists.txt
+++ b/velox/exec/tests/CMakeLists.txt
@@ -149,6 +149,7 @@ set(
   FilterProjectTest.cpp
   AsyncConnectorTest.cpp
   OptimizedPartitionedOutputTest.cpp
+  OptimizedVectorHasherTest.cpp
 )
 
 set(
diff --git a/velox/exec/tests/OptimizedVectorHasherTest.cpp b/velox/exec/tests/OptimizedVectorHasherTest.cpp
new file mode 100644
index 00000000000..e0a107b6fd4
--- /dev/null
+++ b/velox/exec/tests/OptimizedVectorHasherTest.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <gtest/gtest.h>
+
+#include "velox/common/base/tests/GTestUtils.h"
+#include "velox/exec/OptimizedVectorHasher.h"
+#include "velox/exec/VectorHasher.h"
+#include "velox/type/tests/utils/CustomTypesForTesting.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+class OptimizedVectorHasherTest : public testing::Test, public VectorTestBase {
+ protected:
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{});
+  }
+
+  BufferPtr makeIndices(
+      vector_size_t size,
+      std::function<vector_size_t(vector_size_t)> indexAt) {
+    auto indices = AlignedBuffer::allocate<vector_size_t>(size, pool());
+    auto rawIndices = indices->asMutable<vector_size_t>();
+    for (vector_size_t i = 0; i < size; ++i) {
+      rawIndices[i] = indexAt(i);
+    }
+    return indices;
+  }
+
+  static SelectivityVector makeOddRows(vector_size_t size) {
+    SelectivityVector oddRows(size);
+    for (vector_size_t i = 0; i < size; i += 2) {
+      oddRows.setValid(i, false);
+    }
+    oddRows.updateBounds();
+    return oddRows;
+  }
+
+  void compareHashes(
+      const TypePtr& type,
+      const VectorPtr& vector,
+      const SelectivityVector& rows,
+      bool mix,
+      uint64_t seed = 0) {
+    auto expectedHasher = VectorHasher::create(type, 0);
+    auto actualHasher = OptimizedVectorHasher::create(type, 0);
+
+    raw_vector<uint64_t> expected(vector->size(), pool());
+    raw_vector<uint64_t> actual(vector->size(), pool());
+    if (mix) {
+      std::iota(expected.begin(), expected.end(), seed);
+      std::iota(actual.begin(), actual.end(), seed);
+    } else {
+      std::fill(expected.begin(), expected.end(), 0);
+      std::fill(actual.begin(), actual.end(), 0);
+    }
+
+    expectedHasher->decode(*vector, rows);
+    actualHasher->decode(*vector, rows);
+
+    expectedHasher->hash(rows, mix, expected);
+    actualHasher->hash(rows, mix, actual);
+
+    for (vector_size_t i = 0; i < vector->size(); ++i) {
+      EXPECT_EQ(expected[i], actual[i]) << "at " << i;
+    }
+  }
+
+  void comparePrecomputed(
+      const TypePtr& type,
+      const VectorPtr& value,
+      vector_size_t size,
+      bool mix,
+      uint64_t seed = 0) {
+    auto expectedHasher = VectorHasher::create(type, 0);
+    auto actualHasher = OptimizedVectorHasher::create(type, 0);
+
+    raw_vector<uint64_t> expected(size, pool());
+    raw_vector<uint64_t> actual(size, pool());
+    if (mix) {
+      std::iota(expected.begin(), expected.end(), seed);
+      std::iota(actual.begin(), actual.end(), seed);
+    } else {
+      std::fill(expected.begin(), expected.end(), 0);
+      std::fill(actual.begin(), actual.end(), 0);
+    }
+
+    const SelectivityVector rows(size);
+    expectedHasher->precompute(*value);
+    actualHasher->precompute(*value);
+
+    expectedHasher->hashPrecomputed(rows, mix, expected);
+    actualHasher->hashPrecomputed(mix, actual);
+
+    for (vector_size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(expected[i], actual[i]) << "at " << i;
+    }
+  }
+};
+
+TEST_F(OptimizedVectorHasherTest, flat) {
+  auto vector = BaseVector::create(BIGINT(), 100, pool());
+  auto flatVector = vector->asFlatVector<int64_t>();
+  for (vector_size_t i = 0; i < 100; ++i) {
+    if (i % 5 == 0) {
+      flatVector->setNull(i, true);
+    } else {
+      flatVector->set(i, i);
+    }
+  }
+
+  const SelectivityVector allRows(100);
+  const auto oddRows = makeOddRows(100);
+
+  compareHashes(BIGINT(), vector, oddRows, false);
+  compareHashes(BIGINT(), vector, allRows, false);
+  compareHashes(BIGINT(), vector, allRows, true, 10);
+
+  flatVector->setNull(0, true);
+  comparePrecomputed(BIGINT(), vector, 100, false);
+
+  flatVector->setNull(0, false);
+  flatVector->set(0, 7);
+  comparePrecomputed(BIGINT(), vector, 100, false);
+
+  flatVector->set(0, 55);
+  comparePrecomputed(BIGINT(), vector, 100, true, 20);
+}
+
+TEST_F(OptimizedVectorHasherTest, boolFlat) {
+  constexpr vector_size_t kSize = 137;
+  auto vector = makeFlatVector<bool>(
+      kSize,
+      [](vector_size_t row) { return row % 7 == 0 || row % 11 == 3; },
+      [](vector_size_t row) { return row % 13 == 5; });
+  const SelectivityVector allRows(vector->size());
+  const auto oddRows = makeOddRows(vector->size());
+
+  compareHashes(BOOLEAN(), vector, oddRows, false);
+  compareHashes(BOOLEAN(), vector, allRows, false);
+  compareHashes(BOOLEAN(), vector, allRows, true, 17);
+
+  vector = makeFlatVector<bool>(
+      kSize, [](vector_size_t row) { return row % 5 < 2; });
+  compareHashes(BOOLEAN(), vector, allRows, false);
+  compareHashes(BOOLEAN(), vector, allRows, true, 23);
+}
+
+TEST_F(OptimizedVectorHasherTest, nans) {
+  static const auto kNaN = std::numeric_limits<double>::quiet_NaN();
+  static const auto kSNaN = std::numeric_limits<double>::signaling_NaN();
+  auto vector = makeFlatVector<double>({1.0, -1.0, kNaN, kSNaN, 0.0, -0.0});
+  const SelectivityVector allRows(vector->size());
+
+  compareHashes(DOUBLE(), vector, allRows, false);
+  compareHashes(DOUBLE(), vector, allRows, true, 15);
+}
+
+TEST_F(OptimizedVectorHasherTest, nonNullConstant) {
+  auto vector = BaseVector::createConstant(INTEGER(), 123, 6, pool());
+  const SelectivityVector allRows(vector->size());
+  const auto oddRows = makeOddRows(vector->size());
+
+  compareHashes(INTEGER(), vector, oddRows, false);
+  compareHashes(INTEGER(), vector, allRows, false);
+  compareHashes(INTEGER(), vector, allRows, true, 7);
+}
+
+TEST_F(OptimizedVectorHasherTest, nullConstant) {
+  auto vector = BaseVector::createNullConstant(INTEGER(), 6, pool());
+  const SelectivityVector allRows(vector->size());
+  const auto oddRows = makeOddRows(vector->size());
+
+  compareHashes(INTEGER(), vector, oddRows, false);
+  compareHashes(INTEGER(), vector, allRows, false);
+  compareHashes(INTEGER(), vector, allRows, true, 11);
+}
+
+TEST_F(OptimizedVectorHasherTest, unknown) {
+  auto vector = makeAllNullFlatVector<UnknownValue>(100);
+  const SelectivityVector allRows(vector->size());
+  const auto oddRows = makeOddRows(vector->size());
+
+  compareHashes(UNKNOWN(), vector, oddRows, false);
+  compareHashes(UNKNOWN(), vector, allRows, false);
+  compareHashes(UNKNOWN(), vector, allRows, true, 0);
+}
+
+TEST_F(OptimizedVectorHasherTest, dictionary) {
+  auto base = makeNullableFlatVector<int64_t>({10, 20, std::nullopt, 40, 50});
+  constexpr vector_size_t kSize = 100;
+  auto dictionary = BaseVector::wrapInDictionary(
+      makeNulls(kSize, [&](vector_size_t row) { return row == 1 || row == 7; }),
+      makeIndices(kSize, [&](vector_size_t row) { return row % base->size(); }),
+      kSize,
+      base);
+  const SelectivityVector allRows(dictionary->size());
+  const auto oddRows = makeOddRows(dictionary->size());
+
+  compareHashes(BIGINT(), dictionary, oddRows, false);
+  compareHashes(BIGINT(), dictionary, allRows, false);
+  compareHashes(BIGINT(), dictionary, allRows, true, 10);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparison) {
+  auto vector = makeNullableFlatVector<int64_t>(
+      {0, 1, 256, 257, std::nullopt, 512, 513},
+      BIGINT_TYPE_WITH_CUSTOM_COMPARISON());
+  const SelectivityVector allRows(vector->size());
+
+  compareHashes(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), vector, allRows, false);
+  compareHashes(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), vector, allRows, true, 9);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparisonArray) {
+  auto vector = makeNullableArrayVector<int64_t>(
+      {{0, 1, 2},
+       {256, 257, 258},
+       {512, 513, 514},
+       {3, 4, 5},
+       {259, 260, 261},
+       {515, 516, 517},
+       {std::nullopt}},
+      ARRAY(BIGINT_TYPE_WITH_CUSTOM_COMPARISON()));
+  const SelectivityVector allRows(vector->size());
+
+  compareHashes(
+      ARRAY(BIGINT_TYPE_WITH_CUSTOM_COMPARISON()), vector, allRows, false);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparisonMap) {
+  auto vector = makeNullableMapVector<int64_t, int64_t>(
+      {std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {0, 10}, {1, 11}, {2, 12}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {256, 266}, {257, 267}, {258, 268}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {512, 522}, {513, 523}, {514, 524}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {3, 103}, {4, 104}, {5, 105}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {259, 359}, {260, 360}, {261, 361}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {515, 615}, {516, 616}, {517, 617}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {0, std::nullopt}}},
+      MAP(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(),
+          BIGINT_TYPE_WITH_CUSTOM_COMPARISON()));
+  const SelectivityVector allRows(vector->size());
+
+  compareHashes(
+      MAP(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(),
+          BIGINT_TYPE_WITH_CUSTOM_COMPARISON()),
+      vector,
+      allRows,
+      false);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparisonRow) {
+  auto vector = makeRowVector(
+      {"a"},
+      {makeNullableFlatVector<int64_t>(
+          {std::nullopt, 0, 1, 256, 257, 512, 513},
+          BIGINT_TYPE_WITH_CUSTOM_COMPARISON())});
+  const SelectivityVector allRows(vector->size());
+
+  compareHashes(vector->type(), vector, allRows, false);
+}
+
+TEST_F(OptimizedVectorHasherTest, precompute) {
+  auto value = makeNullableFlatVector<int64_t>({std::nullopt});
+  comparePrecomputed(BIGINT(), value, 100, false);
+
+  value = makeNullableFlatVector<int64_t>({7});
+  comparePrecomputed(BIGINT(), value, 100, false);
+
+  value = makeNullableFlatVector<int64_t>({55});
+  comparePrecomputed(BIGINT(), value, 100, true, 100);
+}
+
+TEST_F(OptimizedVectorHasherTest, typeMismatch) {
+  auto hasher = OptimizedVectorHasher::create(BIGINT(), 0);
+  auto vector = makeFlatVector<StringView>({"a", "b", "c"});
+  SelectivityVector rows(vector->size());
+
+  VELOX_ASSERT_THROW(
+      hasher->decode(*vector, rows), "Type mismatch: BIGINT vs. VARCHAR");
+}
+
+} // namespace

From fceb8bc59641c7db2563036ae3c4d38aaa6a74d8 Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Tue, 5 May 2026 15:48:45 +0100
Subject: [PATCH 17/24] feat(PartitionedOutput): fix test failures caused by
 listenerFactory

---
 velox/exec/tests/OptimizedPartitionedOutputTest.cpp       | 7 +++++++
 .../tests/PrestoIterativePartitioningSerializerTest.cpp   | 8 +++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
index 86475cd0f41..4308aa77bfd 100644
--- a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
+++ b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
@@ -127,6 +127,13 @@ struct PartitionedOutputResult {
 /// Shared infrastructure for all OptimizedPartitionedOutput tests.
 class OptimizedPartitionedOutputTest : public OperatorTestBase {
  protected:
+  void SetUp() override {
+    OperatorTestBase::SetUp();
+    bufferManager_->setListenerFactory([]() {
+      return std::make_unique<serializer::presto::PrestoOutputStreamListener>();
+    });
+  }
+
   std::shared_ptr<core::QueryCtx> createQueryContext(
       std::unordered_map<std::string, std::string> config) {
     config[core::QueryConfig::kOptimizedPartitionedOutputEnabled] = "true";
diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
index ea76dc11ab6..dcea3d08b7d 100644
--- a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
+++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
@@ -81,7 +81,13 @@ class PrestoIterativePartitioningSerializerTestBase : public VectorTestBase {
       uint32_t numPartitions) {
     SerdeOpts opts;
     return std::make_unique<PrestoIterativePartitioningSerializer>(
-        type, numPartitions, opts, pool_.get());
+        type,
+        numPartitions,
+        opts,
+        pool_.get(),
+        []() -> std::unique_ptr<OutputStreamListener> {
+          return std::make_unique<PrestoOutputStreamListener>();
+        });
   }
 
   /// Builds a serializer that computes a CRC32 checksum on each flush via a

From 1760e3398b0dbc4538a3ba111793e0abd0c94932 Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Wed, 22 Apr 2026 09:11:34 -0700
Subject: [PATCH 18/24] feat(PartitionedOutput): Improve
 PartitionedVectorBenchmark

BatchMaker always allocates a null buffer even when no rows are null.
This commit replaces BatchMaker with a VectorTestBase-based VectorBuilder so
that benchmarks measure the non-nullable path faithfully. It also includes
some minor format cleanups.
---
 .../benchmarks/PartitionedVectorBenchmark.cpp | 131 +++++++++++++++---
 1 file changed, 109 insertions(+), 22 deletions(-)

diff --git a/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp
index 681a2e0c188..8589bbec0a0 100644
--- a/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp
+++ b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp
@@ -13,15 +13,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <algorithm>
 
-#include <absl/random/uniform_int_distribution.h>
 #include <folly/Benchmark.h>
 #include <folly/init/Init.h>
 
-#include <algorithm>
+#include "velox/vector/PartitionedVector.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
 
-#include "dwio/common/tests/utils/BatchMaker.h"
-#include "vector/PartitionedVector.h"
+// Define gtest_color and gtest_filter so the benchmark can be run from CLion.
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
 
 using namespace facebook::velox;
 using namespace facebook::velox::test;
@@ -32,7 +34,7 @@ namespace {
 
 thread_local auto gen = std::mt19937(42);
 
-auto noNulls = [](vector_size_t) { return false; };
+const std::function<bool(vector_size_t)> noNulls;
 
 auto allNulls = [](vector_size_t) { return true; };
 
@@ -92,36 +94,122 @@ auto randomPartitionFunction = [](const RowVectorPtr& vector,
   }
 };
 
-std::shared_ptr<memory::MemoryPool> pool;
-std::vector<uint32_t> partitions;
+/// Builds benchmark row vectors, one column at a time.
+class VectorBuilder : public VectorTestBase {
+ public:
+  RowVectorPtr makeRowVector(
+      const RowTypePtr& rowType,
+      vector_size_t numRows,
+      const std::function<bool(vector_size_t)>& isNullAt) {
+    std::vector<VectorPtr> children;
+    children.reserve(rowType->size());
+    for (auto i = 0; i < rowType->size(); ++i) {
+      children.push_back(makeColumn(rowType->childAt(i), numRows, isNullAt));
+    }
+    return VectorTestBase::makeRowVector(children);
+  }
 
-RowVectorPtr createTestVector(
-    const std::function<RowTypePtr(int32_t)>& rowTypeGenerator,
-    vector_size_t numRows,
-    int32_t numColumns,
-    const std::function<bool(vector_size_t)>& isNullAt) {
-  auto rowType = rowTypeGenerator(numColumns);
-  const auto batch = BatchMaker::createBatch(rowType, numRows, *pool, isNullAt);
-  return std::static_pointer_cast<RowVector>(batch);
-}
+ private:
+  VectorPtr makeColumn(
+      const TypePtr& type,
+      vector_size_t size,
+      const std::function<bool(vector_size_t)>& isNullAt) {
+    switch (type->kind()) {
+      case TypeKind::BOOLEAN:
+        return makeFlatVector<bool>(
+            size, [](auto row) { return row % 2 == 0; }, isNullAt, type);
+      case TypeKind::TINYINT:
+        return makeFlatVector<int8_t>(
+            size,
+            [](auto row) { return static_cast<int8_t>(row); },
+            isNullAt,
+            type);
+      case TypeKind::SMALLINT:
+        return makeFlatVector<int16_t>(
+            size,
+            [](auto row) { return static_cast<int16_t>(row); },
+            isNullAt,
+            type);
+      case TypeKind::INTEGER:
+        if (type->isDate()) {
+          return makeFlatVector<int32_t>(
+              size,
+              [](auto row) { return static_cast<int32_t>(row); },
+              isNullAt,
+              type);
+        }
+        return makeFlatVector<int32_t>(
+            size, [](auto row) { return row; }, isNullAt, type);
+      case TypeKind::BIGINT:
+        return makeFlatVector<int64_t>(
+            size,
+            [](auto row) { return static_cast<int64_t>(row); },
+            isNullAt,
+            type);
+      case TypeKind::HUGEINT:
+        return makeFlatVector<int128_t>(
+            size,
+            [](auto row) { return static_cast<int128_t>(row); },
+            isNullAt,
+            type);
+      case TypeKind::REAL:
+        return makeFlatVector<float>(
+            size,
+            [](auto row) { return static_cast<float>(row); },
+            isNullAt,
+            type);
+      case TypeKind::DOUBLE:
+        return makeFlatVector<double>(
+            size,
+            [](auto row) { return static_cast<double>(row); },
+            isNullAt,
+            type);
+      case TypeKind::TIMESTAMP:
+        return makeFlatVector<Timestamp>(
+            size,
+            [](auto row) { return Timestamp(row, row * 1'000); },
+            isNullAt,
+            type);
+      case TypeKind::VARCHAR:
+      case TypeKind::VARBINARY:
+        // Alternate between short inlined strings (≤12 bytes) and long
+        // out-of-line strings (>12 bytes) to exercise both StringView paths.
+        return makeFlatVector<std::string>(
+            size,
+            [](auto row) -> std::string {
+              if (row % 2 == 0) {
+                return fmt::format("v-{}", row);
+              }
+              return fmt::format("velox_benchmark_string_{:08d}", row);
+            },
+            isNullAt,
+            type);
+      default:
+        VELOX_UNSUPPORTED("Unsupported benchmark type: {}", type->toString());
+    }
+  }
+};
 
 } // namespace
 
+/// Constructs all benchmark state and runs the benchmark. Called once per
+/// benchmark entry; construction is outside the timed region.
 void runBM(
     uint32_t iterations,
     const std::function<RowTypePtr(int32_t)>& rowTypeGenerator,
     int32_t numColumns,
     uint32_t numPartitions,
     const std::function<bool(vector_size_t)>& isNullAt = noNulls,
-    vector_size_t numRows = 10000) {
+    vector_size_t numRows = 10'000) {
   folly::BenchmarkSuspender suspender;
+  VectorBuilder vectorBuilder;
+  auto pool = memory::memoryManager()->addLeafPool();
   PartitionBuildContext ctx;
-  auto vector =
-      createTestVector(rowTypeGenerator, numRows, numColumns, isNullAt);
+  auto vector = vectorBuilder.makeRowVector(
+      rowTypeGenerator(numColumns), numRows, isNullAt);
+  std::vector<uint32_t> partitions;
   randomPartitionFunction(vector, numPartitions, partitions);
   for (uint32_t i = 0; i < iterations; ++i) {
-    // PartitionedVector::create mutates its input, so each iteration needs a
-    // fresh copy to keep inputs consistent.
     const auto vectorCopy = std::static_pointer_cast<RowVector>(
         BaseVector::copy(*vector, pool.get()));
     suspender.dismiss();
@@ -178,7 +266,6 @@ BENCHMARK_TYPE(Mixed, mixedFlatTypeGenerator);
 int main(int argc, char** argv) {
   folly::Init init{&argc, &argv};
   memory::MemoryManager::initialize(memory::MemoryManager::Options{});
-  pool = memory::memoryManager()->addLeafPool();
   folly::runBenchmarks();
   return 0;
 }

From bf93e3966164ccfdaccb3e56cbca1c40af55d0b6 Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Sat, 9 May 2026 14:28:22 -0700
Subject: [PATCH 19/24] fix: Pass Velox CMake build type to Folly

Velox never passed CMAKE_BUILD_TYPE into Folly's own configure step, and
cmake_install only forwards the flags its caller supplies, so Folly was
not built in release mode even when Velox was built in release mode. This
commit adds -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" to FOLLY_FLAGS, so
a release Velox dependency setup now builds release Folly on macOS and
Linux.
---
 scripts/setup-common.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/scripts/setup-common.sh b/scripts/setup-common.sh
index 9699939d2af..6673e432a16 100755
--- a/scripts/setup-common.sh
+++ b/scripts/setup-common.sh
@@ -48,7 +48,12 @@ function install_fmt {
 
 function install_folly {
   wget_and_untar https://github.com/facebook/folly/archive/refs/tags/"${FB_OS_VERSION}".tar.gz folly
-  local FOLLY_FLAGS=(-DBUILD_SHARED_LIBS="$VELOX_BUILD_SHARED" -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON)
+  local FOLLY_FLAGS=(
+    -DBUILD_SHARED_LIBS="$VELOX_BUILD_SHARED"
+    -DBUILD_TESTS=OFF
+    -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}"
+    -DFOLLY_HAVE_INT128_T=ON
+  )
   # When folly is static, use static gflags to avoid dual gflags flag
   # registration when .so plugins are dlopen'd (both the binary and plugin
   # would register the same flags in a shared gflags registry).

From aabd0b116cdbe05709bca6510512a6b71f80ee94 Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Sat, 9 May 2026 12:17:15 -0700
Subject: [PATCH 20/24] misc: Clean up OptimizedVectorHasherBenchmark.cpp

- Remove benchmarks with vector size 1000000 and only keep vector size
  10000.
- Add tinyint and smallint benchmarks.
---
 .../OptimizedVectorHasherBenchmark.cpp        | 78 +++++++------------
 1 file changed, 29 insertions(+), 49 deletions(-)

diff --git a/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
index 5dffdd7cbf3..32fdc278857 100644
--- a/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
+++ b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
@@ -207,28 +207,26 @@ void benchmarkOptimizedVectorHasher(
       iterations, nullMode, encodingMode, mix, size, dictionarySize);
 }
 
-#define REGISTER_HASHER_PAIR(                                                      \
-    T,                                                                             \
-    TYPE_NAME,                                                                     \
-    NULL_MODE,                                                                     \
-    NULL_NAME,                                                                     \
-    ENCODING_MODE,                                                                 \
-    ENCODING_NAME,                                                                 \
-    MIX,                                                                           \
-    MIX_NAME,                                                                      \
-    SIZE,                                                                          \
-    DICTIONARY_SIZE)                                                               \
-  BENCHMARK(                                                                       \
-      TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME##_##SIZE, n) {        \
-    benchmarkVectorHasher<T>(                                                      \
-        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE);                  \
-  }                                                                                \
-  BENCHMARK_RELATIVE(                                                              \
-      optimized_##TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME##_##SIZE, \
-      n) {                                                                         \
-    benchmarkOptimizedVectorHasher<T>(                                             \
-        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE);                  \
-  }                                                                                \
+#define REGISTER_HASHER_PAIR(                                                  \
+    T,                                                                         \
+    TYPE_NAME,                                                                 \
+    NULL_MODE,                                                                 \
+    NULL_NAME,                                                                 \
+    ENCODING_MODE,                                                             \
+    ENCODING_NAME,                                                             \
+    MIX,                                                                       \
+    MIX_NAME,                                                                  \
+    SIZE,                                                                      \
+    DICTIONARY_SIZE)                                                           \
+  BENCHMARK(TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME, n) {       \
+    benchmarkVectorHasher<T>(                                                  \
+        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE);              \
+  }                                                                            \
+  BENCHMARK_RELATIVE(                                                          \
+      optimized_##TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME, n) { \
+    benchmarkOptimizedVectorHasher<T>(                                         \
+        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE);              \
+  }                                                                            \
   BENCHMARK_DRAW_LINE();
 
 #define REGISTER_HASHER_NULL_MODES( \
@@ -298,23 +296,13 @@ void benchmarkOptimizedVectorHasher(
       SIZE,                                                                    \
       SIZE)
 
-#define REGISTER_HASHER_SIZES(                                                 \
-    T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME)                 \
-  REGISTER_HASHER_NULL_MODES(                                                  \
-      T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME, 10000, 10000) \
-  REGISTER_HASHER_NULL_MODES(                                                  \
-      T,                                                                       \
-      TYPE_NAME,                                                               \
-      ENCODING_MODE,                                                           \
-      ENCODING_NAME,                                                           \
-      MIX,                                                                     \
-      MIX_NAME,                                                                \
-      1000000,                                                                 \
-      1000000)
+#define REGISTER_HASHER_SIZES(                                 \
+    T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_NULL_MODES(                                  \
+      T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME, 10000, 10000)
 
-#define REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME)       \
-  REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, 10000) \
-  REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, 1000000)
+#define REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, 10000)
 
 #define REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(         \
     T, TYPE_NAME, MIX, MIX_NAME, SIZE, PERCENT, PERCENT_NAME) \
@@ -338,17 +326,7 @@ void benchmarkOptimizedVectorHasher(
   REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
       T, TYPE_NAME, MIX, MIX_NAME, 10000, 20, 20pct)                  \
   REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
-      T, TYPE_NAME, MIX, MIX_NAME, 10000, 5, 5pct)                    \
-  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
-      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 80, 80pct)                \
-  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
-      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 60, 60pct)                \
-  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
-      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 40, 40pct)                \
-  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
-      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 20, 20pct)                \
-  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
-      T, TYPE_NAME, MIX, MIX_NAME, 1000000, 5, 5pct)
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 5, 5pct)
 
 #define REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, MIX, MIX_NAME)  \
   REGISTER_HASHER_SIZES(                                        \
@@ -361,6 +339,8 @@ void benchmarkOptimizedVectorHasher(
   REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, true, mix)
 
 REGISTER_HASHER_TYPE(bool, boolean)
+REGISTER_HASHER_TYPE(int8_t, tinyint)
+REGISTER_HASHER_TYPE(int16_t, smallint)
 REGISTER_HASHER_TYPE(int32_t, integer)
 REGISTER_HASHER_TYPE(int64_t, bigint)
 REGISTER_HASHER_TYPE(int128_t, hugeint)

From 86c1e73a3410a7f7d22f66a96d3fa835c655cedc Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Tue, 5 May 2026 15:49:07 +0100
Subject: [PATCH 21/24] feat(PartitionedOutput): Add BufferState to track
 bytesBuffered

---
 velox/exec/OptimizedPartitionedOutput.cpp     |   3 +-
 .../tests/OptimizedPartitionedOutputTest.cpp  | 131 +++++
 .../PrestoIterativePartitioningSerializer.cpp | 497 +++++++++++++-----
 .../PrestoIterativePartitioningSerializer.h   |  41 +-
 ...stoIterativePartitioningSerializerTest.cpp | 169 ++++++
 velox/vector/PartitionedVector.h              |   7 +
 6 files changed, 697 insertions(+), 151 deletions(-)

diff --git a/velox/exec/OptimizedPartitionedOutput.cpp b/velox/exec/OptimizedPartitionedOutput.cpp
index e825ea94633..0ca9a957a8c 100644
--- a/velox/exec/OptimizedPartitionedOutput.cpp
+++ b/velox/exec/OptimizedPartitionedOutput.cpp
@@ -87,8 +87,7 @@ void OptimizedPartitionedOutput::addInput(RowVectorPtr input) {
       !replicateNullsAndAny_,
       "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput");
 
-  if (serializer_->bytesBuffered() + input->retainedSize() >=
-      maxOutputBufferBytes_) {
+  if (serializer_->estimateBytesAfterAppend(input) > maxOutputBufferBytes_) {
     flush();
   }
 
diff --git a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
index 4308aa77bfd..af9f272e062 100644
--- a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
+++ b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
@@ -16,10 +16,12 @@
 
 #include <future>
 #include <random>
+#include <string_view>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include "velox/common/base/BitUtil.h"
 #include "velox/common/memory/ByteStream.h"
 #include "velox/exec/HashPartitionFunction.h"
 #include "velox/exec/OptimizedPartitionedOutput.h"
@@ -28,9 +30,28 @@
 #include "velox/exec/tests/utils/PlanBuilder.h"
 #include "velox/exec/tests/utils/QueryAssertions.h"
 #include "velox/serializers/PrestoSerializer.h"
+#include "velox/serializers/PrestoSerializerSerializationUtils.h"
 
 namespace facebook::velox::exec::test {
 
+namespace {
+
+int64_t simpleColumnPageBytes(
+    std::string_view encodingName,
+    int64_t numRows,
+    int64_t numNulls,
+    int64_t valueWidth) {
+  return serializer::presto::detail::kHeaderSize + // page header
+      4 + // numColumns
+      4 + static_cast<int64_t>(encodingName.size()) + // encoding header
+      4 + // rowCount
+      1 + // null flag
+      (numNulls > 0 ? bits::nbytes(numRows) : 0) + // null bitmap
+      (numRows - numNulls) * valueWidth; // values
+}
+
+} // namespace
+
 /// How null values are distributed in value columns.
 enum class NullMode {
   kNoNull, // no null values
@@ -758,6 +779,116 @@ INSTANTIATE_TEST_SUITE_P(
 
 // ─── non-parameterized tests ─────────────────────────────────────────────────
 
+// In the single-partition case, if the second addInput() is estimated to stay
+// below the partitioned-output limit, it doesn't flush before appending.
+TEST_F(OptimizedPartitionedOutputTest, noPreFlushWhenEstimateBelowLimit) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  std::vector<RowVectorPtr> inputBatches = {
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}),
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({20})})};
+
+  const auto twoRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8);
+  auto result = runPartitionedOutput(
+      "local://test-buffer-below-limit",
+      inputBatches,
+      {},
+      1,
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(twoRowPageBytes + 1)}});
+
+  EXPECT_EQ(result.numAppends, 2);
+  EXPECT_EQ(result.numFlushes, 1);
+
+  auto expected = makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 20})});
+  auto actual = concatPages(result.pages[0], rowType);
+  velox::test::assertEqualVectors(expected, actual);
+}
+
+// In the single-partition case, if the second addInput() is estimated to land
+// exactly on the partitioned-output limit, it doesn't flush before appending.
+TEST_F(OptimizedPartitionedOutputTest, noPreFlushWhenEstimateAtLimit) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  std::vector<RowVectorPtr> inputBatches = {
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}),
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({20})})};
+
+  const auto twoRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8);
+  auto result = runPartitionedOutput(
+      "local://test-buffer-equals-limit",
+      inputBatches,
+      {},
+      1,
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(twoRowPageBytes)}});
+
+  EXPECT_EQ(result.numAppends, 2);
+  EXPECT_EQ(result.numFlushes, 1);
+
+  auto expected = makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 20})});
+  auto actual = concatPages(result.pages[0], rowType);
+  velox::test::assertEqualVectors(expected, actual);
+}
+
+// In the single-partition case, if the second addInput() is estimated to
+// exceed the partitioned-output limit, addInput() flushes before appending.
+TEST_F(OptimizedPartitionedOutputTest, preFlushWhenEstimateExceedsLimit) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  std::vector<RowVectorPtr> inputBatches = {
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}),
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({20})})};
+
+  const auto twoRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8);
+  auto result = runPartitionedOutput(
+      "local://test-buffer-exceeds-limit",
+      inputBatches,
+      {},
+      1,
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(twoRowPageBytes - 1)}});
+
+  EXPECT_EQ(result.numAppends, 2);
+  EXPECT_EQ(result.numFlushes, 2);
+
+  auto expected = makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 20})});
+  auto actual = concatPages(result.pages[0], rowType);
+  velox::test::assertEqualVectors(expected, actual);
+}
+
+// In the multi-partition case, estimateBytesAfterAppend() may conservatively
+// assume an input could go to the last empty partition even when every row
+// actually goes to an existing partition, causing a pre-flush.
+TEST_F(
+    OptimizedPartitionedOutputTest,
+    preFlushWhenConservativeEstimateExceedsLimit) {
+  auto rowType = ROW({"p1"}, {INTEGER()});
+  std::vector<RowVectorPtr> inputBatches = {
+      makeRowVector({"p1"}, {makeFlatVector<int32_t>({5})}),
+      makeRowVector({"p1"}, {makeFlatVector<int32_t>({5})})};
+
+  const auto twoRowPageBytes = simpleColumnPageBytes("INT_ARRAY", 2, 0, 4);
+  auto result = runPartitionedOutput(
+      "local://test-buffer-conservative-exceeds-limit",
+      inputBatches,
+      {"p1"},
+      2,
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(
+            twoRowPageBytes)}}); // exact append fits; estimate does not
+
+  EXPECT_EQ(result.numAppends, 2);
+  EXPECT_EQ(result.numFlushes, 2);
+  EXPECT_EQ(result.numNonEmptyPartitions, 1);
+
+  EXPECT_THAT(result.pageCounts, testing::UnorderedElementsAre(2, 0));
+  EXPECT_THAT(result.rowCounts, testing::UnorderedElementsAre(2, 0));
+
+  const auto nonEmptyPartition = result.rowCounts[0] > 0 ? 0 : 1;
+
+  auto expected = makeRowVector({"p1"}, {makeFlatVector<int32_t>({5, 5})});
+  auto actual = concatPages(result.pages[nonEmptyPartition], rowType);
+  velox::test::assertEqualVectors(expected, actual);
+}
+
 // Verifies that replicateNullsAndAny raises an error since it is not yet
 // supported by OptimizedPartitionedOutput.
 TEST_F(OptimizedPartitionedOutputTest, replicateNullsAndAnyUnsupported) {
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.cpp b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
index 20f893107f1..c7ccdbf652a 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.cpp
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
@@ -15,6 +15,9 @@
  */
 #include "velox/serializers/PrestoIterativePartitioningSerializer.h"
 
+#include <algorithm>
+#include <optional>
+
 #include "velox/common/base/BitUtil.h"
 #include "velox/type/Type.h"
 #include "velox/vector/ComplexVector.h"
@@ -133,32 +136,162 @@ simpleColumnBytes(const TypePtr& colType, int64_t numRows, int64_t numNulls) {
       (numRows - numNulls) * fixedTypeWidth(colType->kind()); // values
 }
 
-/// Returns per-partition exact byte counts for one column (all partitions).
-/// Recurses into nested ROW columns.
+/// Returns the number of nulls in 'vector', or std::nullopt when it cannot be
+/// determined here. DICTIONARY vectors are counted with a row-by-row scan.
+std::optional<vector_size_t> countNulls(const BaseVector& vector) {
+  if (!vector.mayHaveNulls()) {
+    return 0;
+  }
+
+  if (const auto nullCount = vector.getNullCount()) {
+    return *nullCount;
+  }
+
+  switch (vector.encoding()) {
+    case VectorEncoding::Simple::FLAT:
+    case VectorEncoding::Simple::ROW:
+      return BaseVector::countNulls(vector.nulls(), vector.size());
+    case VectorEncoding::Simple::CONSTANT:
+      return vector.isNullAt(0) ? vector.size() : 0;
+    case VectorEncoding::Simple::DICTIONARY: {
+      vector_size_t nullCount = 0;
+      for (auto i = 0; i < vector.size(); ++i) {
+        nullCount += vector.isNullAt(i);
+      }
+      return nullCount;
+    }
+    default:
+      return std::nullopt;
+  }
+}
+
+/// Returns the maximum total null-bitmap bytes when totalRows rows are spread
+/// over numPartitionsWithNulls partitions (DCHECKed to be <= totalRows):
+/// one row, hence one byte, per partition plus one byte per 8 remaining rows.
+int64_t maxBitmapBytes(int64_t totalRows, int64_t numPartitionsWithNulls) {
+  if (numPartitionsWithNulls == 0) {
+    return 0;
+  }
+  VELOX_DCHECK_LE(numPartitionsWithNulls, totalRows);
+  return numPartitionsWithNulls + (totalRows - numPartitionsWithNulls) / 8;
+}
+
+/// Base class for column nodes in the serializer's per-partition accounting.
 ///
-/// Byte layout per column type:
-///   Fixed-width: simpleColumnBytes(colType, numRows, numNulls)
-///   ROW:         7 (header) + 4 (numFields)
-///                + sum(child sizes)
-///                + 4 (numRows) + 4*(numRows+1) (offsets) + 1 (hasNulls)
-///                + (rowNulls>0 ? bits::nbytes(numRows) : 0)
-std::vector<int64_t> computeColumnFlushSizes(
-    const std::vector<PartitionedVectorPtr>& columnVectors,
-    const TypePtr& colType,
-    const std::vector<uint32_t>& nonEmptyPartitions,
-    const std::vector<vector_size_t>& rowsPerPartition,
-    uint32_t numPartitions) {
-  std::vector<int64_t> sizes(numPartitions, 0);
+/// A node tracks exact row, null, and byte counts for one column while
+/// appending partitioned vectors.
+class ColumnBufferState {
+ public:
+  ColumnBufferState(TypePtr type, uint32_t numPartitions)
+      : type_(std::move(type)),
+        numPartitions_(numPartitions),
+        rowsPerPartition_(numPartitions, 0),
+        nullsPerPartition_(numPartitions, 0),
+        bytesPerPartition_(numPartitions, 0) {}
+
+  virtual ~ColumnBufferState() = default;
+
+  static std::unique_ptr<ColumnBufferState> create(
+      const TypePtr& type,
+      uint32_t numPartitions);
+
+  virtual void append(const PartitionedVectorPtr& partitionedVector) = 0;
+
+  virtual void clear() {
+    std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0);
+    std::fill(nullsPerPartition_.begin(), nullsPerPartition_.end(), 0);
+    std::fill(bytesPerPartition_.begin(), bytesPerPartition_.end(), 0);
+    numNonEmptyPartitions_ = 0;
+    numPartitionsWithNulls_ = 0;
+  }
 
-  // Compute per-partition null counts by summing across batches.
-  std::vector<int64_t> nullCounts(numPartitions, 0);
-  for (uint32_t p : nonEmptyPartitions) {
-    for (const auto& pv : columnVectors) {
-      nullCounts[p] += pv->numNullsAt(p);
+  const std::vector<vector_size_t>& rowsPerPartition() const {
+    return rowsPerPartition_;
+  }
+
+  const std::vector<int64_t>& bytesPerPartition() const {
+    return bytesPerPartition_;
+  }
+
+  uint32_t numNonEmptyPartitions() const {
+    return numNonEmptyPartitions_;
+  }
+
+  uint32_t numPartitionsWithNulls() const {
+    return numPartitionsWithNulls_;
+  }
+
+  int64_t nullBitmapBytesBuffered() const {
+    int64_t total = 0;
+    for (auto p = 0; p < numPartitions_; ++p) {
+      if (nullsPerPartition_[p] > 0) {
+        total += bits::nbytes(rowsPerPartition_[p]);
+      }
     }
+    return total;
   }
 
-  switch (colType->kind()) {
+ protected:
+  const TypePtr type_;
+  const uint32_t numPartitions_;
+  std::vector<vector_size_t> rowsPerPartition_;
+  std::vector<vector_size_t> nullsPerPartition_;
+  std::vector<int64_t> bytesPerPartition_;
+
+  // count of partitions with at least one buffered row
+  uint32_t numNonEmptyPartitions_{0};
+
+  // count of partitions that require a null bitmap
+  uint32_t numPartitionsWithNulls_{0};
+};
+
+/// Buffer state for one fixed-width column.
+class FixedWidthBufferState : public ColumnBufferState {
+ public:
+  FixedWidthBufferState(TypePtr type, uint32_t numPartitions)
+      : ColumnBufferState(std::move(type), numPartitions) {}
+
+  void append(const PartitionedVectorPtr& partitionedVector) override {
+    for (auto p = 0; p < numPartitions_; ++p) {
+      const auto numRows = partitionedVector->numRowsAt(p);
+      if (numRows == 0) {
+        continue;
+      }
+
+      const auto numNulls = partitionedVector->numNullsAt(p);
+      auto& rows = rowsPerPartition_[p];
+      auto& nulls = nullsPerPartition_[p];
+
+      if (rows == 0) {
+        ++numNonEmptyPartitions_;
+      }
+      if (nulls == 0 && numNulls > 0) {
+        ++numPartitionsWithNulls_;
+      }
+      rows += numRows;
+      nulls += numNulls;
+      bytesPerPartition_[p] = simpleColumnBytes(type_, rows, nulls);
+    }
+  }
+};
+
+/// Buffer state for one VARCHAR or VARBINARY column.
+class VariableWidthBufferState : public ColumnBufferState {
+ public:
+  VariableWidthBufferState(TypePtr type, uint32_t numPartitions)
+      : ColumnBufferState(std::move(type), numPartitions) {}
+
+  void append(const PartitionedVectorPtr& partitionedVector) override {
+    VELOX_NYI(
+        "Variable-width columns are not yet supported by "
+        "PrestoIterativePartitioningSerializer::append");
+  }
+};
+
+std::unique_ptr<ColumnBufferState> ColumnBufferState::create(
+    const TypePtr& type,
+    uint32_t numPartitions) {
+  switch (type->kind()) {
     case TypeKind::BOOLEAN:
     case TypeKind::TINYINT:
     case TypeKind::SMALLINT:
@@ -167,70 +300,130 @@ std::vector<int64_t> computeColumnFlushSizes(
     case TypeKind::REAL:
     case TypeKind::DOUBLE:
     case TypeKind::HUGEINT:
-      for (uint32_t p : nonEmptyPartitions) {
-        sizes[p] =
-            simpleColumnBytes(colType, rowsPerPartition[p], nullCounts[p]);
-      }
-      break;
-
-    case TypeKind::TIMESTAMP:
+      return std::make_unique<FixedWidthBufferState>(type, numPartitions);
     case TypeKind::VARCHAR:
     case TypeKind::VARBINARY:
+      return std::make_unique<VariableWidthBufferState>(type, numPartitions);
+    case TypeKind::TIMESTAMP:
+    case TypeKind::ROW:
     case TypeKind::ARRAY:
     case TypeKind::MAP:
       VELOX_NYI(
-          "computeColumnFlushSizes: unsupported type kind {}",
-          TypeKindName::toName(colType->kind()));
-
-    case TypeKind::ROW: {
-      const auto& rowSchema = colType->asRow();
-      const int32_t numFields = static_cast<int32_t>(rowSchema.size());
-
-      // Fixed per-partition overhead: header(7) + numFields(4) + footer:
-      // numRows(4)
-      // + sequential offsets 4*(numRows+1) + hasNulls(1)
-      // + null bitmap for the ROW vector itself if any rows in this partition
-      // are null.
-      for (uint32_t p : nonEmptyPartitions) {
-        const int64_t numRows = rowsPerPartition[p];
-        const int64_t rowNullBitmapBytes =
-            nullCounts[p] > 0 ? bits::nbytes(numRows) : 0;
-        sizes[p] = 7 + 4 + // "ROW" header + numFields
-            4 + 4 * (numRows + 1) + 1 + // footer: numRows + offsets + hasNulls
-            rowNullBitmapBytes;
-      }
-      // Add child column sizes recursively.
-      for (uint32_t col = 0; col < static_cast<uint32_t>(numFields); ++col) {
-        std::vector<PartitionedVectorPtr> childVectors;
-        childVectors.reserve(columnVectors.size());
-        for (const auto& pv : columnVectors) {
-          childVectors.push_back(
-              std::dynamic_pointer_cast<PartitionedRowVector>(pv)->childAt(
-                  col));
-        }
-        const auto childSizes = computeColumnFlushSizes(
-            childVectors,
-            rowSchema.childAt(col),
-            nonEmptyPartitions,
-            rowsPerPartition,
-            numPartitions);
-        for (uint32_t p : nonEmptyPartitions) {
-          sizes[p] += childSizes[p];
-        }
-      }
-      break;
-    }
-
+          "Unsupported type kind for createColumnBufferState: {}",
+          type->kind());
     default:
       VELOX_UNSUPPORTED(
-          "computeColumnFlushSizes: unsupported type kind {}",
-          TypeKindName::toName(colType->kind()));
+          "Unsupported type kind for createColumnBufferState: {}", type->kind());
   }
-  return sizes;
 }
 
 } // namespace
 
+/// Top-level buffer state for one output page.
+///
+/// For each partition, tracks page-level headers and aggregates child column
+/// sizes.
+class BufferState {
+ public:
+  BufferState(
+      uint32_t numPartitions,
+      std::vector<std::unique_ptr<ColumnBufferState>> children)
+      : numPartitions_(numPartitions),
+        rowsPerPartition_(numPartitions, 0),
+        bytesPerPartition_(numPartitions, 0),
+        children_(std::move(children)) {}
+
+  static std::unique_ptr<BufferState> create(
+      const RowTypePtr& type,
+      uint32_t numPartitions);
+
+  void append(const PartitionedVectorPtr& partitionedVector) {
+    auto rowVector =
+        std::dynamic_pointer_cast<PartitionedRowVector>(partitionedVector);
+    VELOX_CHECK_NOT_NULL(rowVector);
+
+    rowsBuffered_ += partitionedVector->baseVector()->size();
+
+    for (auto column = 0; column < children_.size(); ++column) {
+      children_[column]->append(rowVector->childAt(column));
+    }
+
+    for (auto p = 0; p < numPartitions_; ++p) {
+      const auto numRows = partitionedVector->numRowsAt(p);
+      if (numRows == 0) {
+        continue;
+      }
+      if (rowsPerPartition_[p] == 0) {
+        ++numNonEmptyPartitions_;
+      }
+      rowsPerPartition_[p] += numRows;
+
+      int64_t partitionBytes = kHeaderSize + 4;
+      for (const auto& child : children_) {
+        partitionBytes += child->bytesPerPartition()[p];
+      }
+      bytesBuffered_ += partitionBytes - bytesPerPartition_[p];
+      bytesPerPartition_[p] = partitionBytes;
+    }
+  }
+
+  void clear() {
+    std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0);
+    std::fill(bytesPerPartition_.begin(), bytesPerPartition_.end(), 0);
+    numNonEmptyPartitions_ = 0;
+    rowsBuffered_ = 0;
+    bytesBuffered_ = 0;
+    for (auto& child : children_) {
+      child->clear();
+    }
+  }
+
+  const std::vector<vector_size_t>& rowsPerPartition() const {
+    return rowsPerPartition_;
+  }
+
+  const std::vector<int64_t>& bytesPerPartition() const {
+    return bytesPerPartition_;
+  }
+
+  uint32_t numNonEmptyPartitions() const {
+    return numNonEmptyPartitions_;
+  }
+
+  vector_size_t rowsBuffered() const {
+    return rowsBuffered_;
+  }
+
+  int64_t bytesBuffered() const {
+    return bytesBuffered_;
+  }
+
+  const std::vector<std::unique_ptr<ColumnBufferState>>& children() const {
+    return children_;
+  }
+
+ private:
+  const uint32_t numPartitions_;
+  std::vector<vector_size_t> rowsPerPartition_;
+  std::vector<int64_t> bytesPerPartition_;
+  uint32_t numNonEmptyPartitions_{0};
+  vector_size_t rowsBuffered_{0};
+  int64_t bytesBuffered_{0};
+  std::vector<std::unique_ptr<ColumnBufferState>> children_;
+};
+
+std::unique_ptr<BufferState> BufferState::create(
+    const RowTypePtr& type,
+    uint32_t numPartitions) {
+  std::vector<std::unique_ptr<ColumnBufferState>> children;
+  children.reserve(type->size());
+  for (auto column = 0; column < type->size(); ++column) {
+    children.push_back(
+        ColumnBufferState::create(type->childAt(column), numPartitions));
+  }
+  return std::make_unique<BufferState>(numPartitions, std::move(children));
+}
+
 PrestoIterativePartitioningSerializer::PrestoIterativePartitioningSerializer(
     RowTypePtr inputType,
     uint32_t numPartitions,
@@ -242,11 +435,90 @@ PrestoIterativePartitioningSerializer::PrestoIterativePartitioningSerializer(
       opts_(opts),
       pool_(pool),
       listenerFactory_(std::move(listenerFactory)),
-      rowsPerPartition_(numPartitions, 0) {
+      numColumns_(type_->size()),
+      bufferState_(BufferState::create(type_, numPartitions_)) {
   VELOX_CHECK_GT(numPartitions_, 0);
   VELOX_CHECK_NOT_NULL(pool_);
+}
 
-  numColumns_ = type_->size();
+PrestoIterativePartitioningSerializer::
+    ~PrestoIterativePartitioningSerializer() = default;
+
+int64_t PrestoIterativePartitioningSerializer::bytesBuffered() const {
+  return bufferState_->bytesBuffered();
+}
+
+vector_size_t PrestoIterativePartitioningSerializer::rowsBuffered() const {
+  return bufferState_->rowsBuffered();
+}
+
+void PrestoIterativePartitioningSerializer::clear() {
+  partitionedRowVectors_.clear();
+  bufferState_->clear();
+}
+
+int64_t PrestoIterativePartitioningSerializer::estimateBytesAfterAppend(
+    const RowVectorPtr& input) const {
+  VELOX_CHECK_NOT_NULL(input);
+
+  if (input->size() == 0) {
+    return bytesBuffered();
+  }
+
+  const auto numRows = input->size();
+
+  // Worst case: each input row lands in a distinct empty partition, capped by
+  // the number of empty partitions.
+  const auto numNewPartitions = std::min<uint32_t>(
+      numRows, numPartitions_ - bufferState_->numNonEmptyPartitions());
+  // One page header per newly non-empty partition.
+  auto estimatedBytes =
+      bufferState_->bytesBuffered() + numNewPartitions * (kHeaderSize + 4);
+
+  for (auto column = 0; column < numColumns_; ++column) {
+    const auto& columnType = type_->childAt(column);
+    if (columnType->isUnknown()) {
+      VELOX_UNSUPPORTED(
+          "Unsupported type kind for "
+          "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}",
+          columnType->kind());
+    } else if (columnType->isFixedWidth()) {
+      const auto* columnState = bufferState_->children()[column].get();
+      const auto inputNulls = countNulls(*input->childAt(column));
+      const auto partitionsWithNulls = std::min<uint32_t>(
+          bufferState_->numNonEmptyPartitions() + numNewPartitions,
+          columnState->numPartitionsWithNulls() + inputNulls.value_or(numRows));
+      const auto nullBitmapBytes = maxBitmapBytes(
+          bufferState_->rowsBuffered() + numRows, partitionsWithNulls);
+      auto nullBitmapBytesBuffered = columnState->nullBitmapBytesBuffered();
+      VELOX_DCHECK_GE(nullBitmapBytes, nullBitmapBytesBuffered);
+
+      estimatedBytes += numNewPartitions *
+              simpleColumnBytes(columnType, 0, 0) + // header growth
+          nullBitmapBytes -
+          nullBitmapBytesBuffered + // null bitmap growth
+          static_cast<int64_t>(numRows - inputNulls.value_or(0)) *
+              fixedTypeWidth(columnType->kind()); // value bytes growth
+    } else {
+      switch (columnType->kind()) {
+        case TypeKind::VARCHAR:
+        case TypeKind::VARBINARY:
+        case TypeKind::ROW:
+        case TypeKind::ARRAY:
+        case TypeKind::MAP:
+          VELOX_NYI(
+              "Unsupported type kind for "
+              "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}",
+              columnType->kind());
+        default:
+          VELOX_UNSUPPORTED(
+              "Unsupported type kind for "
+              "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}",
+              columnType->kind());
+      }
+    }
+  }
+  return estimatedBytes;
 }
 
 void PrestoIterativePartitioningSerializer::append(
@@ -270,18 +542,8 @@ void PrestoIterativePartitioningSerializer::append(
       ctx,
       pool_);
 
-  const vector_size_t* partitionOffsets =
-      partitionedRowVector->rawPartitionOffsets();
-  vector_size_t prevOffset = 0;
-  for (uint32_t p = 0; p < numPartitions_; ++p) {
-    rowsPerPartition_[p] += partitionOffsets[p] - prevOffset;
-    prevOffset = partitionOffsets[p];
-  }
-
+  bufferState_->append(partitionedRowVector);
   partitionedRowVectors_.push_back(std::move(partitionedRowVector));
-
-  bytesBuffered_ += input->retainedSize();
-  rowsBuffered_ += static_cast<int64_t>(input->size());
 }
 
 // ---------------------------------------------------------------------------
@@ -295,11 +557,7 @@ PrestoIterativePartitioningSerializer::flush() {
       ? flushUncompressed()
       : flushCompressed();
 
-  partitionedRowVectors_.clear();
-  flushSizes_.clear();
-  std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0);
-  bytesBuffered_ = 0;
-  rowsBuffered_ = 0;
+  clear();
 
   return pages;
 }
@@ -313,31 +571,13 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
   // 1. Determine non-empty partitions.
   std::vector<uint32_t> nonEmptyPartitions;
   for (uint32_t p = 0; p < numPartitions_; ++p) {
-    if (rowsPerPartition_[p] > 0) {
+    if (bufferState_->rowsPerPartition()[p] > 0) {
       nonEmptyPartitions.push_back(p);
     }
   }
-
-  // 2. Pre-compute exact byte sizes per top-level column and partition.
   const auto& rowSchema = type_->asRow();
-  flushSizes_.assign(rowSchema.size(), std::vector<int64_t>(numPartitions_, 0));
-  for (uint32_t col = 0; col < rowSchema.size(); ++col) {
-    std::vector<PartitionedVectorPtr> columnVectors;
-    columnVectors.reserve(partitionedRowVectors_.size());
-    for (const auto& pRowVector : partitionedRowVectors_) {
-      columnVectors.push_back(
-          std::dynamic_pointer_cast<PartitionedRowVector>(pRowVector)
-              ->childAt(col));
-    }
-    flushSizes_[col] = computeColumnFlushSizes(
-        columnVectors,
-        rowSchema.childAt(col),
-        nonEmptyPartitions,
-        rowsPerPartition_,
-        numPartitions_);
-  }
 
-  // 3. Create per-partition listeners first so the codec mask can be derived
+  // 2. Create per-partition listeners first so the codec mask can be derived
   // from whether the factory actually produced a listener. The factory may
   // return nullptr (e.g. when OutputBufferManager has no listener factory
   // set), in which case checksumming is skipped and the checksum bit must not
@@ -352,30 +592,27 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
       listeners[nonEmptyPartitions[0]] != nullptr;
   const char codecMask = getCodecMarker(checksumEnabled);
 
-  // 4. Create output streams sized to the exact bytes each partition will need,
+  // 3. Create output streams sized to the exact bytes each partition will need,
   // so that the entire payload fits. This avoids multiple resizing and copying.
   std::vector<std::unique_ptr<IOBufOutputStream>> outputStreams(numPartitions_);
   std::vector<IOBufOutputStream*> rawOutputStreams(numPartitions_);
   std::vector<std::streampos> beginStreamPositions(numPartitions_);
 
   for (uint32_t p : nonEmptyPartitions) {
-    int64_t initialSize = kHeaderSize + 4; // page header + numCols
-    for (uint32_t col = 0; col < rowSchema.size(); ++col) {
-      initialSize += flushSizes_[col][p];
-    }
+    listeners[p] = std::make_unique<PrestoOutputStreamListener>();
     outputStreams[p] = std::make_unique<IOBufOutputStream>(
-        *pool_, listeners[p].get(), initialSize);
+        *pool_, listeners[p].get(), bufferState_->bytesPerPartition()[p]);
     rawOutputStreams[p] = outputStreams[p].get();
     beginStreamPositions[p] = outputStreams[p]->tellp();
 
     flushStart(*outputStreams[p], p, codecMask);
   }
 
-  // 5. Flush column data.
+  // 4. Flush column data.
   flushRowChildren(
       partitionedRowVectors_, rowSchema, nonEmptyPartitions, rawOutputStreams);
 
-  // 6. Finalize the page by seeking back to fill in sizes and CRC, and get the
+  // 5. Finalize the page by seeking back to fill in sizes and CRC, and get the
   // IOBuf and numOfRows from each stream.
   std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
       result;
@@ -386,8 +623,8 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
         beginStreamPositions[p],
         codecMask,
         listeners[p].get());
-    result[p] =
-        std::make_pair(outputStreams[p]->getIOBuf(), rowsPerPartition_[p]);
+    result[p] = std::make_pair(
+        outputStreams[p]->getIOBuf(), bufferState_->rowsPerPartition()[p]);
   }
 
   return result;
@@ -412,7 +649,8 @@ void PrestoIterativePartitioningSerializer::flushStart(
   }
 
   // Write 21-byte Presto page header; sizes and CRC are filled in later.
-  const int32_t numRows = static_cast<int32_t>(rowsPerPartition_[partition]);
+  const int32_t numRows =
+      static_cast<int32_t>(bufferState_->rowsPerPartition()[partition]);
   char header[kHeaderSize] = {};
   std::memcpy(&header[0], &numRows, 4);
   std::memcpy(&header[4], &codecMask, 1);
@@ -466,7 +704,7 @@ void PrestoIterativePartitioningSerializer::flushFinish(
     crc = computeChecksum(
         *prestoListener,
         static_cast<int8_t>(codecMask),
-        static_cast<int32_t>(rowsPerPartition_[partition]),
+        static_cast<int32_t>(bufferState_->rowsPerPartition()[partition]),
         uncompressedSize);
   }
 
@@ -694,7 +932,9 @@ void PrestoIterativePartitioningSerializer::flushRowCounts(
     const std::vector<uint32_t>& nonEmptyPartitions,
     const std::vector<IOBufOutputStream*>& outputStreams) const {
   for (uint32_t p : nonEmptyPartitions) {
-    writeInt32(outputStreams[p], static_cast<int32_t>(rowsPerPartition_[p]));
+    writeInt32(
+        outputStreams[p],
+        static_cast<int32_t>(bufferState_->rowsPerPartition()[p]));
   }
 }
 
@@ -726,7 +966,9 @@ void PrestoIterativePartitioningSerializer::flushNulls(
   std::vector<std::vector<uint8_t>> bitmaps(numPartitions_);
   for (uint32_t p : nonEmptyPartitions) {
     if (nullCounts[p] > 0) {
-      bitmaps[p].assign(bits::nbytes(rowsPerPartition_[p]), bits::kNotNullByte);
+      bitmaps[p].assign(
+          bits::nbytes(bufferState_->rowsPerPartition()[p]),
+          bits::kNotNullByte);
     }
   }
 
@@ -761,7 +1003,7 @@ void PrestoIterativePartitioningSerializer::flushNulls(
 
     // Convert Velox format (LSB-first, 1=not-null) to Presto wire format
     // (MSB-first, 1=null) in-place.
-    const int32_t numBytes = bits::nbytes(rowsPerPartition_[p]);
+    const int32_t numBytes = bits::nbytes(bufferState_->rowsPerPartition()[p]);
     for (int32_t i = 0; i < numBytes; ++i) {
       bitmaps[p][i] = ~bitmaps[p][i];
       bits::reverseBits(&bitmaps[p][i], 1);
@@ -857,7 +1099,8 @@ void PrestoIterativePartitioningSerializer::flushSequentialOffsets(
     const std::vector<uint32_t>& nonEmptyPartitions,
     const std::vector<IOBufOutputStream*>& outputStreams) const {
   for (uint32_t p : nonEmptyPartitions) {
-    const int32_t numRows = static_cast<int32_t>(rowsPerPartition_[p]);
+    const int32_t numRows =
+        static_cast<int32_t>(bufferState_->rowsPerPartition()[p]);
     for (int32_t i = 0; i <= numRows; ++i) {
       writeInt32(outputStreams[p], i);
     }
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.h b/velox/serializers/PrestoIterativePartitioningSerializer.h
index f0ea802fe9e..88abeb49e5a 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.h
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.h
@@ -32,6 +32,8 @@ namespace facebook::velox::serializer::presto {
 /// Convenience alias matching PrestoSerializer.cpp convention.
 using SerdeOpts = PrestoVectorSerde::PrestoOptions;
 
+class BufferState;
+
 /// Serializes a stream of RowVectors into per-partition Presto pages.
 ///
 /// Each call to append() routes rows to their assigned partition. flush()
@@ -54,6 +56,14 @@ class PrestoIterativePartitioningSerializer {
       std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory =
           nullptr);
 
+  ~PrestoIterativePartitioningSerializer();
+
+  /// Returns a conservative estimate of bytesBuffered() after appending
+  /// `input`. The partition assignment of the input is not known at the time of
+  /// the call, so this assumes worst-case growth from new non-empty partitions
+  /// and may overestimate.
+  int64_t estimateBytesAfterAppend(const RowVectorPtr& input) const;
+
   /// Routes each row in `input` to the partition indicated by `partitions`.
   /// `partitions.size()` must equal `input->size()`.
   void append(
@@ -66,22 +76,12 @@ class PrestoIterativePartitioningSerializer {
   std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
   flush();
 
-  /// Returns the total retained bytes of all appended input vectors.
-  int64_t bytesBuffered() const {
-    return bytesBuffered_;
-  }
+  /// Returns the serialized bytes buffered across all partitions since the last
+  /// flush.
+  int64_t bytesBuffered() const;
 
   /// Returns the total number of rows appended since the last flush.
-  int64_t rowsBuffered() const {
-    return rowsBuffered_;
-  }
-
-  /// Returns the number of rows buffered for the given partition.
-  /// Must be called before flush(), which resets per-partition counts.
-  int64_t rowsPerPartition(uint32_t partition) const {
-    VELOX_DCHECK_LT(partition, numPartitions_);
-    return rowsPerPartition_[partition];
-  }
+  vector_size_t rowsBuffered() const;
 
  private:
   std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
@@ -89,6 +89,8 @@ class PrestoIterativePartitioningSerializer {
   std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
   flushCompressed();
 
+  void clear();
+
   void flushStart(IOBufOutputStream& out, uint32_t partition, char codecMask)
       const;
 
@@ -174,19 +176,14 @@ class PrestoIterativePartitioningSerializer {
   memory::MemoryPool* pool_;
   std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory_;
 
-  /// Cumulative row count per partition across all appended batches.
-  std::vector<vector_size_t> rowsPerPartition_;
-
   /// Number of top-level columns in `type_`.
   uint32_t numColumns_{0};
 
   std::vector<PartitionedVectorPtr> partitionedRowVectors_;
 
-  int64_t bytesBuffered_{0};
-  int64_t rowsBuffered_{0};
-
-  /// Per-column, per-partition exact byte counts computed during flush.
-  std::vector<std::vector<int64_t>> flushSizes_;
+  /// Accumulated state for all batches buffered since the last
+  /// flush.
+  std::unique_ptr<BufferState> bufferState_;
 };
 
 } // namespace facebook::velox::serializer::presto
diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
index dcea3d08b7d..87a81ff996a 100644
--- a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
+++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
@@ -15,17 +15,37 @@
  */
 
 #include <random>
+#include <string_view>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include "velox/common/base/BitUtil.h"
 #include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+
+#include "velox/serializers/PrestoSerializerSerializationUtils.h"
 #include "velox/vector/tests/utils/VectorTestBase.h"
 
 using namespace facebook::velox;
 using namespace facebook::velox::serializer::presto;
 using namespace facebook::velox::test;
 
+namespace {
+
+int64_t simpleColumnPageBytes(
+    std::string_view encodingName,
+    int64_t numRows,
+    int64_t numNulls,
+    int64_t valueWidth) {
+  return serializer::presto::detail::kHeaderSize + 4 // page header + num cols
+      + 4 + static_cast<int64_t>(encodingName.size()) // column header
+      + 4 // num rows
+      + 1 + (numNulls > 0 ? bits::nbytes(numRows) : 0) // null flags
+      + (numRows - numNulls) * valueWidth; // values
+}
+
+} // namespace
+
 // ---------------------------------------------------------------------------
 // Shared base fixture
 // ---------------------------------------------------------------------------
@@ -126,6 +146,18 @@ class PrestoIterativePartitioningSerializerTestBase : public VectorTestBase {
     return value;
   }
 
+  int64_t totalFlushedBytes(
+      std::map<
+          uint32_t,
+          std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>& pages)
+      const {
+    int64_t totalBytes = 0;
+    for (const auto& [_, page] : pages) {
+      totalBytes += page.first->computeChainDataLength();
+    }
+    return totalBytes;
+  }
+
   PrestoVectorSerde serde_;
 };
 
@@ -491,9 +523,11 @@ TEST_F(PrestoIterativePartitioningSerializerTest, multipleAppends) {
       {2, 0, 1});
 
   EXPECT_EQ(serializer->rowsBuffered(), 6);
+  const auto bufferedBytes = serializer->bytesBuffered();
 
   auto ioBufs = serializer->flush();
   ASSERT_EQ(ioBufs.size(), 3);
+  EXPECT_EQ(bufferedBytes, totalFlushedBytes(ioBufs));
 
   auto r0 = deserialize(*ioBufs.at(0).first, type);
   auto r1 = deserialize(*ioBufs.at(1).first, type);
@@ -508,6 +542,141 @@ TEST_F(PrestoIterativePartitioningSerializerTest, multipleAppends) {
   EXPECT_EQ(sortedValues<int64_t>(r2, 0), (std::vector<int64_t>{300, 400}));
 }
 
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    bytesBufferedPartitionGrowth) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 2);
+
+  const auto singleRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 1, 0, 8);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}), {0});
+  EXPECT_EQ(serializer->bytesBuffered(), singleRowPageBytes);
+
+  auto input = makeRowVector({"v"}, {makeFlatVector<int64_t>({20})});
+  EXPECT_EQ(serializer->bytesBuffered(), singleRowPageBytes);
+
+  serializer->append(input, {1});
+  const auto bytesBuffered = serializer->bytesBuffered();
+  EXPECT_EQ(serializer->bytesBuffered(), 2 * singleRowPageBytes);
+
+  auto ioBufs = serializer->flush();
+  EXPECT_EQ(serializer->bytesBuffered(), 0);
+  EXPECT_EQ(bytesBuffered, totalFlushedBytes(ioBufs));
+}
+
+TEST_F(PrestoIterativePartitioningSerializerTest, bytesBufferedNullFlagGrowth) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 1);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6, 7, 8})}),
+      std::vector<uint32_t>(8, 0));
+  EXPECT_EQ(
+      serializer->bytesBuffered(),
+      simpleColumnPageBytes("LONG_ARRAY", 8, 0, 8));
+
+  auto input =
+      makeRowVector({"v"}, {makeNullableFlatVector<int64_t>({std::nullopt})});
+  EXPECT_EQ(
+      serializer->bytesBuffered(),
+      simpleColumnPageBytes("LONG_ARRAY", 8, 0, 8));
+
+  serializer->append(input, {0});
+  const auto bytesBuffered = serializer->bytesBuffered();
+  EXPECT_EQ(bytesBuffered, simpleColumnPageBytes("LONG_ARRAY", 9, 1, 8));
+
+  auto ioBufs = serializer->flush();
+  EXPECT_EQ(serializer->bytesBuffered(), 0);
+  EXPECT_EQ(bytesBuffered, totalFlushedBytes(ioBufs));
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendExactForSinglePartition) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 1);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6, 7, 8})}),
+      std::vector<uint32_t>(8, 0));
+
+  auto input =
+      makeRowVector({"v"}, {makeNullableFlatVector<int64_t>({std::nullopt})});
+  const auto estimatedAfter = serializer->estimateBytesAfterAppend(input);
+
+  serializer->append(input, {0});
+  EXPECT_EQ(estimatedAfter, serializer->bytesBuffered());
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendExactForConstant) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 1);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4})}),
+      std::vector<uint32_t>(4, 0));
+
+  auto input = makeRowVector({"v"}, {makeConstant<int64_t>(7, 2)});
+  const auto estimatedAfter = serializer->estimateBytesAfterAppend(input);
+
+  serializer->append(input, std::vector<uint32_t>(2, 0));
+  EXPECT_EQ(estimatedAfter, serializer->bytesBuffered());
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendExactForNullConstant) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 1);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6, 7, 8})}),
+      std::vector<uint32_t>(8, 0));
+
+  auto input = makeRowVector({"v"}, {makeConstant<int64_t>(std::nullopt, 80)});
+  const auto estimatedAfter = serializer->estimateBytesAfterAppend(input);
+
+  serializer->append(input, std::vector<uint32_t>(80, 0));
+  EXPECT_EQ(estimatedAfter, serializer->bytesBuffered());
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendOverestimatesPartitionedAppend) {
+  auto type = ROW({"a", "b"}, {BIGINT(), INTEGER()});
+  auto serializer = makeSerializer(type, 3);
+
+  serializer->append(
+      makeRowVector(
+          {"a", "b"},
+          {
+              makeFlatVector<int64_t>({10, 20}),
+              makeFlatVector<int32_t>({100, 200}),
+          }),
+      {0, 1});
+
+  auto input = makeRowVector(
+      {"a", "b"},
+      {
+          makeNullableFlatVector<int64_t>({30, std::nullopt, 50, 60}),
+          makeNullableFlatVector<int32_t>({300, 400, std::nullopt, 600}),
+      });
+
+  // All rows land in an already non-empty partition, but
+  // estimateBytesAfterAppend still assumes this input could go to the last
+  // empty partition because the real distribution is not known yet.
+  const std::vector<uint32_t> partitions{1, 1, 1, 1};
+
+  const auto estimatedAfter = serializer->estimateBytesAfterAppend(input);
+
+  serializer->append(input, partitions);
+  EXPECT_GT(estimatedAfter, serializer->bytesBuffered());
+}
+
 // Flush twice: second flush on empty state returns an empty map.
 TEST_F(PrestoIterativePartitioningSerializerTest, flushTwice) {
   auto type = ROW({"a"}, {BIGINT()});
diff --git a/velox/vector/PartitionedVector.h b/velox/vector/PartitionedVector.h
index eb008f1193b..24dec3f03fb 100644
--- a/velox/vector/PartitionedVector.h
+++ b/velox/vector/PartitionedVector.h
@@ -155,6 +155,13 @@ class PartitionedVector {
     return numNullsPerPartition_[partition];
   }
 
+  vector_size_t numRowsAt(uint32_t partition) const {
+    auto beginOffset =
+        partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1];
+    auto endOffset = rawEndPartitionOffsets_[partition];
+    return endOffset - beginOffset;
+  }
+
   TypeKind typeKind() const {
     return vector_->typeKind();
   }

From 38254ae5b08f932ec77c375573c2e09afcf8bc0f Mon Sep 17 00:00:00 2001
From: Xin Zhang <desertsxin@gmail.com>
Date: Tue, 28 Apr 2026 11:32:31 -0700
Subject: [PATCH 22/24] feat(PartitionedOutput): Add outputChannels support

---
 velox/exec/OptimizedPartitionedOutput.cpp     |  80 +++++++-
 velox/exec/OptimizedPartitionedOutput.h       |  24 ++-
 .../tests/OptimizedPartitionedOutputTest.cpp  | 177 ++++++++++++++----
 .../PrestoIterativePartitioningSerializer.cpp |  88 +++++++--
 .../PrestoIterativePartitioningSerializer.h   |  48 ++++-
 ...stoIterativePartitioningSerializerTest.cpp |  64 +++++++
 6 files changed, 423 insertions(+), 58 deletions(-)

diff --git a/velox/exec/OptimizedPartitionedOutput.cpp b/velox/exec/OptimizedPartitionedOutput.cpp
index 0ca9a957a8c..d9983a18cbd 100644
--- a/velox/exec/OptimizedPartitionedOutput.cpp
+++ b/velox/exec/OptimizedPartitionedOutput.cpp
@@ -16,6 +16,8 @@
 
 #include "velox/exec/OptimizedPartitionedOutput.h"
 
+#include <unordered_map>
+
 #include "velox/exec/HashPartitionFunction.h"
 #include "velox/exec/SerializedPage.h"
 #include "velox/exec/Task.h"
@@ -67,12 +69,15 @@ OptimizedPartitionedOutput::OptimizedPartitionedOutput(
       operatorCtx_->driverCtx()->queryConfig().shuffleCompressionKind());
   options.minCompressionRatio = 0.8;
 
+  initializeSerializerLayout();
+
   serializer_ = std::make_unique<
       serializer::presto::PrestoIterativePartitioningSerializer>(
-      inputType_,
+      outputType_,
       numDestinations_,
       options,
       pool_,
+      serializerInputByOutput_,
       [bufferManager =
            bufferManager_]() -> std::unique_ptr<OutputStreamListener> {
         auto lockedBufferManager = bufferManager.lock();
@@ -87,7 +92,10 @@ void OptimizedPartitionedOutput::addInput(RowVectorPtr input) {
       !replicateNullsAndAny_,
       "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput");
 
-  if (serializer_->estimateBytesAfterAppend(input) > maxOutputBufferBytes_) {
+  auto serializerInput = prepareSerializerInput(input);
+
+  if (serializer_->estimateBytesAfterAppend(serializerInput) >
+      maxOutputBufferBytes_) {
     flush();
   }
 
@@ -105,7 +113,7 @@ void OptimizedPartitionedOutput::addInput(RowVectorPtr input) {
     }
   }
 
-  serializer_->append(input, partitions_);
+  serializer_->append(serializerInput, partitions_);
 
   auto lockedStats = stats_.wlock();
   ++numAppends_;
@@ -157,6 +165,72 @@ bool OptimizedPartitionedOutput::isFinished() {
   return finished_;
 }
 
+void OptimizedPartitionedOutput::initializeSerializerLayout() {
+  if (outputType_->size() == 0 || outputChannels_.empty()) {
+    serializerInputType_ = outputType_;
+    return;
+  }
+
+  std::unordered_map<column_index_t, column_index_t> outputToSerializerInput;
+  outputToSerializerInput.reserve(outputChannels_.size());
+
+  std::vector<std::string> names;
+  std::vector<TypePtr> types;
+  names.reserve(outputChannels_.size());
+  types.reserve(outputChannels_.size());
+  serializerInputByOutput_.reserve(outputChannels_.size());
+
+  for (const auto outputChannel : outputChannels_) {
+    auto it = outputToSerializerInput.find(outputChannel);
+    if (it == outputToSerializerInput.end()) {
+      const auto serializerInputChannel =
+          static_cast<column_index_t>(serializerInputChannels_.size());
+      serializerInputChannels_.push_back(outputChannel);
+      names.push_back(inputType_->nameOf(outputChannel));
+      types.push_back(inputType_->childAt(outputChannel));
+      it =
+          outputToSerializerInput.emplace(outputChannel, serializerInputChannel)
+              .first;
+    }
+    serializerInputByOutput_.push_back(it->second);
+  }
+
+  serializerInputType_ = ROW(std::move(names), std::move(types));
+}
+
+RowVectorPtr OptimizedPartitionedOutput::prepareSerializerInput(
+    const RowVectorPtr& input) const {
+  VELOX_CHECK_NOT_NULL(input);
+
+  if (serializerInputType_->size() == 0) {
+    return std::make_shared<RowVector>(
+        input->pool(),
+        serializerInputType_,
+        nullptr /*nulls*/,
+        input->size(),
+        std::vector<VectorPtr>{});
+  }
+
+  if (serializerInputChannels_.empty()) {
+    input->loadedVector();
+    return input;
+  }
+
+  std::vector<VectorPtr> serializerInputColumns;
+  serializerInputColumns.reserve(serializerInputChannels_.size());
+  for (auto channel : serializerInputChannels_) {
+    auto loadedChild = BaseVector::loadedVectorShared(input->childAt(channel));
+    serializerInputColumns.push_back(loadedChild);
+  }
+
+  return std::make_shared<RowVector>(
+      input->pool(),
+      serializerInputType_,
+      nullptr /*nulls*/,
+      input->size(),
+      std::move(serializerInputColumns));
+}
+
 void OptimizedPartitionedOutput::flush() {
   const auto flushedBytes = serializer_->bytesBuffered();
   const auto flushedRows = serializer_->rowsBuffered();
diff --git a/velox/exec/OptimizedPartitionedOutput.h b/velox/exec/OptimizedPartitionedOutput.h
index 0f9dd2e2b47..78ddcaf4a6f 100644
--- a/velox/exec/OptimizedPartitionedOutput.h
+++ b/velox/exec/OptimizedPartitionedOutput.h
@@ -55,6 +55,15 @@ class OptimizedPartitionedOutput : public Operator {
   bool isFinished() override;
 
  private:
+  /// Computes the serializer input columns and the mapping from output columns
+  /// to serializer input columns.
+  void initializeSerializerLayout();
+
+  /// Builds the RowVector consumed by the serializer. When the output layout
+  /// has duplicated columns, this projects only the distinct columns and
+  /// leaves duplication to flush time.
+  RowVectorPtr prepareSerializerInput(const RowVectorPtr& input) const;
+
   /// Serializes all buffered rows into Presto pages and enqueues each page
   /// into the output buffer manager. All destinations are always enqueued;
   /// sets blockingReason_ and records a future if the output buffer is full.
@@ -62,11 +71,9 @@ class OptimizedPartitionedOutput : public Operator {
   void flush();
 
   const std::string taskId_;
-  /// Input row type; also used as output type (column reordering not yet
-  /// applied).
   const RowTypePtr inputType_;
   const std::vector<column_index_t> keyChannels_;
-  /// Non-empty when the output column order differs from the input.
+  /// Non-empty when the output layout differs from the input.
   const std::vector<column_index_t> outputChannels_;
   const int32_t numDestinations_;
 
@@ -78,12 +85,23 @@ class OptimizedPartitionedOutput : public Operator {
   const int64_t maxOutputBufferBytes_;
 
   velox::memory::MemoryPool* pool_;
+
   /// Computes per-row partition assignments. Null when numDestinations_ == 1.
   std::unique_ptr<core::PartitionFunction> partitionFunction_;
   /// Reusable buffer for per-row partition assignments.
   std::vector<uint32_t> partitions_;
+
   std::unique_ptr<serializer::presto::PrestoIterativePartitioningSerializer>
       serializer_;
+  /// Row type passed to serializer_->append(). It only includes distinct
+  /// columns from the output layout.
+  RowTypePtr serializerInputType_;
+  /// Input channels that make up the serializer input type. Empty if the output
+  /// layout is the same as the input.
+  std::vector<column_index_t> serializerInputChannels_;
+  /// For each output column index, store the corresponding serializer input
+  /// column.
+  std::vector<column_index_t> serializerInputByOutput_;
 
   BlockingReason blockingReason_{BlockingReason::kNotBlocked};
   ContinueFuture future_;
diff --git a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
index af9f272e062..ed9fa875624 100644
--- a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
+++ b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
@@ -244,6 +244,64 @@ class OptimizedPartitionedOutputTest : public OperatorTestBase {
     return result;
   }
 
+  RowTypePtr outputTypeForLayout(
+      const RowTypePtr& inputType,
+      const std::vector<std::string>& outputLayout) {
+    if (outputLayout.empty()) {
+      return inputType;
+    }
+
+    std::vector<TypePtr> types;
+    types.reserve(outputLayout.size());
+    for (const auto& name : outputLayout) {
+      types.push_back(inputType->findChild(name));
+    }
+    return ROW(outputLayout, std::move(types));
+  }
+
+  RowVectorPtr buildOutput(
+      const RowVectorPtr& input,
+      const std::vector<std::string>& outputLayout) {
+    const auto inputType = asRowType(input->type());
+    const auto outputType = outputTypeForLayout(inputType, outputLayout);
+
+    std::vector<VectorPtr> columns;
+    columns.reserve(outputLayout.size());
+    for (const auto& name : outputLayout) {
+      columns.push_back(input->childAt(inputType->getChildIdx(name)));
+    }
+    return std::make_shared<RowVector>(
+        input->pool(), outputType, nullptr, input->size(), std::move(columns));
+  }
+
+  /// Sorts a vector by value for order-independent comparison. Returns a
+  /// dictionary vector with rows sorted in ascending order.
+  VectorPtr canonicalize(const VectorPtr& vector) {
+    const auto numRows = vector->size();
+    auto indices = makeIndices(numRows, [](auto i) { return i; });
+    auto* data = indices->asMutable<vector_size_t>();
+    std::stable_sort(data, data + numRows, [&](auto a, auto b) {
+      return vector->compare(vector.get(), a, b) < 0;
+    });
+    return BaseVector::wrapInDictionary(nullptr, indices, numRows, vector);
+  }
+
+  /// Builds a RowVector by gathering rows from inputBatches at the given
+  /// (batchIdx, rowIdx) positions. Used to construct the per-partition expected
+  /// RowVector.
+  RowVectorPtr gatherRows(
+      const std::vector<RowVectorPtr>& batches,
+      const std::vector<std::pair<int, int>>& rowList,
+      const RowTypePtr& rowType) {
+    const auto numRows = static_cast<vector_size_t>(rowList.size());
+    auto result = std::static_pointer_cast<RowVector>(
+        BaseVector::create(rowType, numRows, pool()));
+    for (vector_size_t r = 0; r < numRows; ++r) {
+      result->copy(batches[rowList[r].first].get(), r, rowList[r].second, 1);
+    }
+    return result;
+  }
+
   int64_t getIntRuntimeStat(Task* task, const std::string& statName) {
     const auto taskStats = task->taskStats();
     const auto& runtimeStats =
@@ -264,14 +322,34 @@ class OptimizedPartitionedOutputTest : public OperatorTestBase {
       int numPartitions,
       std::unordered_map<std::string, std::string> extraConfig = {},
       std::chrono::seconds timeout = std::chrono::seconds{30}) {
+    return runPartitionedOutputWithLayout(
+        taskId,
+        inputBatches,
+        partitionKeys,
+        numPartitions,
+        {},
+        std::move(extraConfig),
+        timeout);
+  }
+
+  PartitionedOutputResult runPartitionedOutputWithLayout(
+      const std::string& taskId,
+      const std::vector<RowVectorPtr>& inputBatches,
+      const std::vector<std::string>& partitionKeys,
+      int numPartitions,
+      const std::vector<std::string>& outputLayout,
+      std::unordered_map<std::string, std::string> extraConfig = {},
+      std::chrono::seconds timeout = std::chrono::seconds{30}) {
     VELOX_CHECK(!inputBatches.empty());
     const auto rowType =
         std::dynamic_pointer_cast<const RowType>(inputBatches[0]->type());
+    const auto outputType = outputTypeForLayout(rowType, outputLayout);
 
-    auto plan = PlanBuilder()
-                    .values(inputBatches)
-                    .partitionedOutput(partitionKeys, numPartitions)
-                    .planNode();
+    auto plan =
+        PlanBuilder()
+            .values(inputBatches)
+            .partitionedOutput(partitionKeys, numPartitions, outputLayout)
+            .planNode();
 
     auto task = Task::create(
         taskId,
@@ -306,7 +384,7 @@ class OptimizedPartitionedOutputTest : public OperatorTestBase {
       if (result.pageCounts[p] > 0) {
         ++result.numNonEmptyPartitions;
       }
-      result.rowCounts[p] = concatPages(result.pages[p], rowType)->size();
+      result.rowCounts[p] = concatPages(result.pages[p], outputType)->size();
     }
 
     result.numAppends = getIntRuntimeStat(task.get(), "numAppends");
@@ -446,34 +524,6 @@ class OptimizedPartitionedOutputParamTest
     return makeRowVector(names, vecs);
   }
 
-  /// Sorts a vector by value for order-independent comparison. Returns a
-  /// dictionary vector with rows sorted in ascending order.
-  VectorPtr canonicalize(const VectorPtr& vector) {
-    const auto numRows = vector->size();
-    auto indices = makeIndices(numRows, [](auto i) { return i; });
-    auto* data = indices->asMutable<vector_size_t>();
-    std::stable_sort(data, data + numRows, [&](auto a, auto b) {
-      return vector->compare(vector.get(), a, b) < 0;
-    });
-    return BaseVector::wrapInDictionary(nullptr, indices, numRows, vector);
-  }
-
-  /// Builds a RowVector by gathering rows from inputBatches at the given
-  /// (batchIdx, rowIdx) positions. Used to construct the per-partition expected
-  /// RowVector.
-  RowVectorPtr gatherRows(
-      const std::vector<RowVectorPtr>& batches,
-      const std::vector<std::pair<int, int>>& rowList,
-      const RowTypePtr& rowType) {
-    const auto numRows = static_cast<vector_size_t>(rowList.size());
-    auto result = std::static_pointer_cast<RowVector>(
-        BaseVector::create(rowType, numRows, pool()));
-    for (vector_size_t r = 0; r < numRows; ++r) {
-      result->copy(batches[rowList[r].first].get(), r, rowList[r].second, 1);
-    }
-    return result;
-  }
-
   /// Verifies that the deserialized pages for each partition exactly match the
   /// rows from inputBatches that were routed to that partition. Both expected
   /// and actual rows are sorted (canonicalized) before comparison to allow
@@ -922,4 +972,65 @@ TEST_F(OptimizedPartitionedOutputTest, replicateNullsAndAnyUnsupported) {
           "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput"));
 }
 
+TEST_F(OptimizedPartitionedOutputTest, outputLayout) {
+  auto input = makeRowVector(
+      {"p1", "v1", "v2", "unused"},
+      {makeFlatVector<int32_t>({0, 1, 2, 3, 4, 5, 6, 7}),
+       makeFlatVector<int64_t>({10, 11, 12, 13, 14, 15, 16, 17}),
+       makeFlatVector<int8_t>({20, 21, 22, 23, 24, 25, 26, 27}),
+       makeFlatVector<int64_t>({30, 31, 32, 33, 34, 35, 36, 37})});
+  auto inputCopy =
+      std::static_pointer_cast<RowVector>(BaseVector::copy(*input, pool()));
+
+  const std::vector<std::string> outputLayout = {"v2", "v1"};
+  const auto inputType = asRowType(input->type());
+  const auto outputType = outputTypeForLayout(inputType, outputLayout);
+  auto expected = buildOutput(inputCopy, outputLayout);
+
+  auto result = runPartitionedOutputWithLayout(
+      "local://test-optimized-output-layout", {input}, {}, 1, outputLayout);
+
+  auto actual = concatPages(result.pages[0], outputType);
+  velox::test::assertEqualVectors(expected, actual);
+}
+
+TEST_F(OptimizedPartitionedOutputTest, duplicateOutputColumns) {
+  constexpr int kNumPartitions = 4;
+  auto input = makeRowVector(
+      {"p1", "v1"},
+      {makeFlatVector<int32_t>({0, 1, 2, 3, 0, 1, 2, 3}),
+       makeFlatVector<int64_t>({10, 11, 12, 13, 14, 15, 16, 17})});
+  auto inputCopy =
+      std::static_pointer_cast<RowVector>(BaseVector::copy(*input, pool()));
+  const std::vector<std::string> outputLayout = {"v1", "v1"};
+  const auto inputType = asRowType(input->type());
+  const auto outputType = outputTypeForLayout(inputType, outputLayout);
+  auto output = buildOutput(inputCopy, outputLayout);
+
+  auto result = runPartitionedOutputWithLayout(
+      "local://test-optimized-output-layout-duplicated-columns",
+      {input},
+      {"p1"},
+      kNumPartitions,
+      outputLayout);
+
+  std::vector<uint32_t> assignments(inputCopy->size());
+  auto partitionFn = std::make_unique<HashPartitionFunction>(
+      false, kNumPartitions, inputType, std::vector<column_index_t>{0});
+  partitionFn->partition(*inputCopy, assignments);
+
+  std::vector<std::vector<std::pair<int, int>>> expectedRows(kNumPartitions);
+  for (vector_size_t i = 0; i < assignments.size(); ++i) {
+    expectedRows[assignments[i]].emplace_back(0, i);
+  }
+
+  for (int p = 0; p < kNumPartitions; ++p) {
+    auto expected = gatherRows({output}, expectedRows[p], outputType);
+    auto actual = concatPages(result.pages[p], outputType);
+    ASSERT_EQ(expected->size(), actual->size()) << "partition " << p;
+    velox::test::assertEqualVectors(
+        canonicalize(expected), canonicalize(actual));
+  }
+}
+
 } // namespace facebook::velox::exec::test
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.cpp b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
index c7ccdbf652a..533b8d6bb75 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.cpp
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
@@ -313,7 +313,8 @@ std::unique_ptr<ColumnBufferState> ColumnBufferState::create(
           type->kind());
     default:
       VELOX_UNSUPPORTED(
-          "Unsupported type kind for createColumnBufferState: {}", type->kind());
+          "Unsupported type kind for createColumnBufferState: {}",
+          type->kind());
   }
 }
 
@@ -337,15 +338,20 @@ class BufferState {
       const RowTypePtr& type,
       uint32_t numPartitions);
 
-  void append(const PartitionedVectorPtr& partitionedVector) {
+  void append(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<column_index_t>& outputToInputChannels) {
     auto rowVector =
         std::dynamic_pointer_cast<PartitionedRowVector>(partitionedVector);
     VELOX_CHECK_NOT_NULL(rowVector);
 
     rowsBuffered_ += partitionedVector->baseVector()->size();
 
-    for (auto column = 0; column < children_.size(); ++column) {
-      children_[column]->append(rowVector->childAt(column));
+    for (column_index_t column = 0; column < children_.size(); ++column) {
+      const auto inputColumn = outputToInputChannels.empty()
+          ? column
+          : outputToInputChannels[column];
+      children_[column]->append(rowVector->childAt(inputColumn));
     }
 
     for (auto p = 0; p < numPartitions_; ++p) {
@@ -425,20 +431,26 @@ std::unique_ptr<BufferState> BufferState::create(
 }
 
 PrestoIterativePartitioningSerializer::PrestoIterativePartitioningSerializer(
-    RowTypePtr inputType,
+    RowTypePtr outputType,
     uint32_t numPartitions,
     const SerdeOpts& opts,
     memory::MemoryPool* pool,
+    std::vector<column_index_t> outputToInputChannels,
     std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory)
-    : type_(std::move(inputType)),
+    : outputType_(std::move(outputType)),
+      outputToInputChannels_(std::move(outputToInputChannels)),
       numPartitions_(numPartitions),
       opts_(opts),
       pool_(pool),
       listenerFactory_(std::move(listenerFactory)),
-      numColumns_(type_->size()),
-      bufferState_(BufferState::create(type_, numPartitions_)) {
+      numColumns_(outputType_->size()),
+      bufferState_(BufferState::create(outputType_, numPartitions_)) {
   VELOX_CHECK_GT(numPartitions_, 0);
   VELOX_CHECK_NOT_NULL(pool_);
+  VELOX_CHECK(
+      outputToInputChannels_.empty() ||
+          outputToInputChannels_.size() == outputType_->size(),
+      "outputToInputChannels size must match output column count");
 }
 
 PrestoIterativePartitioningSerializer::
@@ -457,9 +469,41 @@ void PrestoIterativePartitioningSerializer::clear() {
   bufferState_->clear();
 }
 
+void PrestoIterativePartitioningSerializer::validateOutputInputMapping(
+    const RowVectorPtr& input) const {
+  const auto numInputColumns = input->childrenSize();
+  for (column_index_t outputColumn = 0; outputColumn < numColumns_;
+       ++outputColumn) {
+    const auto inputColumn = outputToInputChannel(outputColumn);
+    VELOX_CHECK_LT(
+        inputColumn,
+        numInputColumns,
+        "Output column {} maps to invalid input column {}",
+        outputColumn,
+        inputColumn);
+
+    const auto& child = input->childAt(inputColumn);
+    VELOX_CHECK_NOT_NULL(
+        child,
+        "Output column {} maps to null input column {}",
+        outputColumn,
+        inputColumn);
+
+    const auto type = outputType_->childAt(outputColumn);
+    VELOX_CHECK(
+        child->type()->equivalent(*type),
+        "Output column {} expects {}, got {} from input column {}",
+        outputColumn,
+        type->toString(),
+        child->type()->toString(),
+        inputColumn);
+  }
+}
+
 int64_t PrestoIterativePartitioningSerializer::estimateBytesAfterAppend(
     const RowVectorPtr& input) const {
   VELOX_CHECK_NOT_NULL(input);
+  validateOutputInputMapping(input);
 
   if (input->size() == 0) {
     return bytesBuffered();
@@ -475,8 +519,17 @@ int64_t PrestoIterativePartitioningSerializer::estimateBytesAfterAppend(
   auto estimatedBytes =
       bufferState_->bytesBuffered() + numNewPartitions * (kHeaderSize + 4);
 
-  for (auto column = 0; column < numColumns_; ++column) {
-    const auto& columnType = type_->childAt(column);
+  // Cache per input column. If multiple output columns map to the same input
+  // column, reuse the already computed incremental bytes.
+  std::vector<std::optional<int64_t>> estimatedIncrementalBytes(
+      input->childrenSize());
+  for (column_index_t column = 0; column < numColumns_; ++column) {
+    const auto inputColumn = outputToInputChannel(column);
+    if (estimatedIncrementalBytes[inputColumn].has_value()) {
+      estimatedBytes += *estimatedIncrementalBytes[inputColumn];
+      continue;
+    }
+    const auto& columnType = outputType_->childAt(column);
     if (columnType->isUnknown()) {
       VELOX_UNSUPPORTED(
           "Unsupported type kind for "
@@ -484,7 +537,7 @@ int64_t PrestoIterativePartitioningSerializer::estimateBytesAfterAppend(
           columnType->kind());
     } else if (columnType->isFixedWidth()) {
       const auto* columnState = bufferState_->children()[column].get();
-      const auto inputNulls = countNulls(*input->childAt(column));
+      const auto inputNulls = countNulls(*input->childAt(inputColumn));
       const auto partitionsWithNulls = std::min<uint32_t>(
           bufferState_->numNonEmptyPartitions() + numNewPartitions,
           columnState->numPartitionsWithNulls() + inputNulls.value_or(numRows));
@@ -493,12 +546,13 @@ int64_t PrestoIterativePartitioningSerializer::estimateBytesAfterAppend(
       auto nullBitmapBytesBuffered = columnState->nullBitmapBytesBuffered();
       VELOX_DCHECK_GE(nullBitmapBytes, nullBitmapBytesBuffered);
 
-      estimatedBytes += numNewPartitions *
+      estimatedIncrementalBytes[inputColumn] = numNewPartitions *
               simpleColumnBytes(columnType, 0, 0) + // header growth
           nullBitmapBytes -
           nullBitmapBytesBuffered + // null bitmap growth
           static_cast<int64_t>(numRows - inputNulls.value_or(0)) *
               fixedTypeWidth(columnType->kind()); // value bytes growth
+      estimatedBytes += *estimatedIncrementalBytes[inputColumn];
     } else {
       switch (columnType->kind()) {
         case TypeKind::VARCHAR:
@@ -530,6 +584,8 @@ void PrestoIterativePartitioningSerializer::append(
       partitions.size(),
       "partitions.size() must equal input->size()");
 
+  validateOutputInputMapping(input);
+
   if (input->size() == 0) {
     return;
   }
@@ -542,7 +598,7 @@ void PrestoIterativePartitioningSerializer::append(
       ctx,
       pool_);
 
-  bufferState_->append(partitionedRowVector);
+  bufferState_->append(partitionedRowVector, outputToInputChannels_);
   partitionedRowVectors_.push_back(std::move(partitionedRowVector));
 }
 
@@ -575,7 +631,7 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
       nonEmptyPartitions.push_back(p);
     }
   }
-  const auto& rowSchema = type_->asRow();
+  const auto& rowSchema = outputType_->asRow();
 
   // 2. Create per-partition listeners first so the codec mask can be derived
   // from whether the factory actually produced a listener. The factory may
@@ -599,7 +655,6 @@ PrestoIterativePartitioningSerializer::flushUncompressed() {
   std::vector<std::streampos> beginStreamPositions(numPartitions_);
 
   for (uint32_t p : nonEmptyPartitions) {
-    listeners[p] = std::make_unique<PrestoOutputStreamListener>();
     outputStreams[p] = std::make_unique<IOBufOutputStream>(
         *pool_, listeners[p].get(), bufferState_->bytesPerPartition()[p]);
     rawOutputStreams[p] = outputStreams[p].get();
@@ -677,7 +732,8 @@ void PrestoIterativePartitioningSerializer::flushRowChildren(
       const auto& partitionedRowVector =
           std::dynamic_pointer_cast<PartitionedRowVector>(partitionedVector);
       VELOX_DCHECK_NOT_NULL(partitionedRowVector.get());
-      column.push_back(partitionedRowVector->childAt(col));
+      column.push_back(
+          partitionedRowVector->childAt(outputToInputChannel(col)));
     }
 
     flushColumn(
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.h b/velox/serializers/PrestoIterativePartitioningSerializer.h
index 88abeb49e5a..8ab7d31dc7e 100644
--- a/velox/serializers/PrestoIterativePartitioningSerializer.h
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.h
@@ -41,6 +41,19 @@ class BufferState;
 /// internal state so the serializer can be reused for the next cycle.
 class PrestoIterativePartitioningSerializer {
  public:
+  PrestoIterativePartitioningSerializer(
+      RowTypePtr outputType,
+      uint32_t numPartitions,
+      const SerdeOpts& opts,
+      memory::MemoryPool* pool)
+      : PrestoIterativePartitioningSerializer(
+            std::move(outputType),
+            numPartitions,
+            opts,
+            pool,
+            {},
+            nullptr) {}
+
   /// Constructs the serializer. If `listenerFactory` is non-null it is called
   /// once per non-empty partition on each flush to create an
   /// OutputStreamListener that accumulates the CRC32 checksum; the checksum
@@ -49,10 +62,29 @@ class PrestoIterativePartitioningSerializer {
   /// which matches the behavior of kNormal PartitionedOutput when
   /// OutputBufferManager has no listener factory set.
   PrestoIterativePartitioningSerializer(
-      RowTypePtr inputType,
+      RowTypePtr outputType,
       uint32_t numPartitions,
       const SerdeOpts& opts,
       memory::MemoryPool* pool,
+      std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory)
+      : PrestoIterativePartitioningSerializer(
+            std::move(outputType),
+            numPartitions,
+            opts,
+            pool,
+            {},
+            std::move(listenerFactory)) {}
+
+  /// Constructs the serializer with an explicit output-column to input-column
+  /// mapping. `outputToInputChannels[i]` indicates which child of the RowVector
+  /// passed to append() should be serialized for output column i. When empty,
+  /// output column i uses input child i.
+  PrestoIterativePartitioningSerializer(
+      RowTypePtr outputType,
+      uint32_t numPartitions,
+      const SerdeOpts& opts,
+      memory::MemoryPool* pool,
+      std::vector<column_index_t> outputToInputChannels,
       std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory =
           nullptr);
 
@@ -84,6 +116,14 @@ class PrestoIterativePartitioningSerializer {
   vector_size_t rowsBuffered() const;
 
  private:
+  void validateOutputInputMapping(const RowVectorPtr&) const;
+
+  column_index_t outputToInputChannel(column_index_t outputColumn) const {
+    return outputToInputChannels_.empty()
+        ? outputColumn
+        : outputToInputChannels_[outputColumn];
+  }
+
   std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
   flushUncompressed();
   std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
@@ -170,13 +210,15 @@ class PrestoIterativePartitioningSerializer {
       const std::vector<uint32_t>& nonEmptyPartitions,
       const std::vector<IOBufOutputStream*>& outputStreams) const;
 
-  RowTypePtr type_;
+  RowTypePtr outputType_;
+  std::vector<column_index_t> outputToInputChannels_;
   uint32_t numPartitions_;
   SerdeOpts opts_;
   memory::MemoryPool* pool_;
+
   std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory_;
 
-  /// Number of top-level columns in `type_`.
+  /// Number of top-level columns in `outputType_`.
   uint32_t numColumns_{0};
 
   std::vector<PartitionedVectorPtr> partitionedRowVectors_;
diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
index 87a81ff996a..4116632f762 100644
--- a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
+++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
@@ -21,6 +21,7 @@
 #include <gtest/gtest.h>
 
 #include "velox/common/base/BitUtil.h"
+#include "velox/common/base/tests/GTestUtils.h"
 #include "velox/serializers/PrestoIterativePartitioningSerializer.h"
 
 #include "velox/serializers/PrestoSerializerSerializationUtils.h"
@@ -592,6 +593,69 @@ TEST_F(PrestoIterativePartitioningSerializerTest, bytesBufferedNullFlagGrowth) {
   EXPECT_EQ(bytesBuffered, totalFlushedBytes(ioBufs));
 }
 
+// A flush-time output mapping serializes one input column into multiple output
+// columns.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    duplicateOutputColumnAtFlush) {
+  auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()});
+  SerdeOpts opts;
+  auto serializer = std::make_unique<PrestoIterativePartitioningSerializer>(
+      outputType, 2, opts, pool_.get(), std::vector<column_index_t>{0, 0});
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 11, 12, 13})}),
+      {0, 1, 0, 1});
+
+  auto ioBufs = serializer->flush();
+  ASSERT_EQ(ioBufs.size(), 2);
+
+  auto r0 = deserialize(*ioBufs.at(0).first, outputType);
+  auto r1 = deserialize(*ioBufs.at(1).first, outputType);
+
+  ASSERT_EQ(r0->size(), 2);
+  ASSERT_EQ(r1->size(), 2);
+
+  EXPECT_EQ(sortedValues<int64_t>(r0, 0), (std::vector<int64_t>{10, 12}));
+  EXPECT_EQ(sortedValues<int64_t>(r0, 1), (std::vector<int64_t>{10, 12}));
+  EXPECT_EQ(sortedValues<int64_t>(r1, 0), (std::vector<int64_t>{11, 13}));
+  EXPECT_EQ(sortedValues<int64_t>(r1, 1), (std::vector<int64_t>{11, 13}));
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    outputInputMappingOutOfRange) {
+  auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()});
+  SerdeOpts opts;
+  auto serializer = std::make_unique<PrestoIterativePartitioningSerializer>(
+      outputType, 2, opts, pool_.get(), std::vector<column_index_t>{0, 1});
+
+  VELOX_ASSERT_THROW(
+      serializer->append(
+          makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 11})}), {0, 1}),
+      "Output column 1 maps to invalid input column 1");
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    outputInputMappingTypeMismatch) {
+  auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()});
+  SerdeOpts opts;
+  auto serializer = std::make_unique<PrestoIterativePartitioningSerializer>(
+      outputType, 2, opts, pool_.get(), std::vector<column_index_t>{0, 1});
+
+  VELOX_ASSERT_THROW(
+      serializer->append(
+          makeRowVector(
+              {"v1", "v2"},
+              {
+                  makeFlatVector<int64_t>({10, 11}),
+                  makeFlatVector<int32_t>({12, 13}),
+              }),
+          {0, 1}),
+      "Output column 1 expects BIGINT, got INTEGER from input column 1");
+}
+
 TEST_F(
     PrestoIterativePartitioningSerializerTest,
     estimateBytesAfterAppendExactForSinglePartition) {

From ecd87e4df1561e5cb401e109b0231496dccbe401 Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Sat, 9 May 2026 12:09:22 -0700
Subject: [PATCH 23/24] perf: Add AVX512 support

---
 scripts/setup-helper-functions.sh    | 17 +++++++++++++----
 velox/common/process/ProcessBase.cpp | 11 +++++++++++
 velox/common/process/ProcessBase.h   |  4 ++++
 velox/flag_definitions/flags.cpp     |  2 ++
 4 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh
index a50fb02ae0e..30bfb7d523d 100755
--- a/scripts/setup-helper-functions.sh
+++ b/scripts/setup-helper-functions.sh
@@ -81,7 +81,8 @@ function github_checkout {
 # The values that CPU_ARCH can take are as follows:
 #   arm64  : Target Apple silicon.
 #   aarch64: Target general 64 bit arm cpus.
-#   avx:     Target Intel CPUs with AVX.
+#   avx512:  Target Intel CPUs with AVX-512F.
+#   avx:     Target Intel CPUs with AVX2.
 #   sse:     Target Intel CPUs with sse.
 # Echo's the appropriate compiler flags which can be captured as so
 # CXX_FLAGS=$(get_cxx_flags) or
@@ -102,7 +103,9 @@ function get_cxx_flags {
       else # x86_64
         local CPU_CAPABILITIES
         CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}')
-        if [[ $CPU_CAPABILITIES =~ "avx" ]]; then
+        if [[ $CPU_CAPABILITIES =~ "avx512f" ]]; then
+          CPU_ARCH="avx512"
+        elif [[ $CPU_CAPABILITIES =~ "avx" ]]; then
           CPU_ARCH="avx"
         else
           CPU_ARCH="sse"
@@ -114,7 +117,9 @@ function get_cxx_flags {
       else # x86_64
         local CPU_CAPABILITIES
         CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1 | awk '{print tolower($0)}')
-        if [[ $CPU_CAPABILITIES =~ "avx" ]]; then
+        if [[ $CPU_CAPABILITIES =~ "avx512f" ]]; then
+          CPU_ARCH="avx512"
+        elif [[ $CPU_CAPABILITIES =~ "avx" ]]; then
           CPU_ARCH="avx"
         elif [[ $CPU_CAPABILITIES =~ "sse" ]]; then
           CPU_ARCH="sse"
@@ -131,8 +136,12 @@ function get_cxx_flags {
     echo -n "-mcpu=apple-m1+crc"
     ;;
 
+  "avx512")
+    echo -n "-mavx512f -mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2"
+    ;;
+
   "avx")
-    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt  -mbmi2"
+    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2"
     ;;
 
   "sse")
diff --git a/velox/common/process/ProcessBase.cpp b/velox/common/process/ProcessBase.cpp
index 0b9a4df2c64..3cbb7fa6a42 100644
--- a/velox/common/process/ProcessBase.cpp
+++ b/velox/common/process/ProcessBase.cpp
@@ -32,6 +32,8 @@ DECLARE_bool(avx2); // Enables use of AVX2 when available NOLINT
 
 DECLARE_bool(bmi2); // Enables use of BMI2 when available NOLINT
 
+DECLARE_bool(avx512f); // Enables use of AVX512F when available NOLINT
+
 namespace facebook {
 namespace velox {
 namespace process {
@@ -106,6 +108,7 @@ uint64_t threadCpuNanos() {
 namespace {
 bool bmi2CpuFlag = folly::CpuId().bmi2();
 bool avx2CpuFlag = folly::CpuId().avx2();
+bool avx512fCpuFlag = folly::CpuId().avx512f();
 } // namespace
 
 bool hasAvx2() {
@@ -124,6 +127,14 @@ bool hasBmi2() {
 #endif
 }
 
+bool hasAvx512f() {
+#ifdef __AVX512F__
+  return avx512fCpuFlag && FLAGS_avx512f;
+#else
+  return false;
+#endif
+}
+
 } // namespace process
 } // namespace velox
 } // namespace facebook
diff --git a/velox/common/process/ProcessBase.h b/velox/common/process/ProcessBase.h
index 34edd6d1467..7ca400b4efa 100644
--- a/velox/common/process/ProcessBase.h
+++ b/velox/common/process/ProcessBase.h
@@ -46,6 +46,10 @@ uint64_t threadCpuNanos();
 /// by flag.
 bool hasAvx2();
 
+/// True if the machine has Intel AVX512F instructions and these are not
+/// disabled by flag.
+bool hasAvx512f();
+
 /// True if the machine has Intel BMI2 instructions and these are not disabled
 /// by flag.
 bool hasBmi2();
diff --git a/velox/flag_definitions/flags.cpp b/velox/flag_definitions/flags.cpp
index 8648e80a68e..4adc6a5a22c 100644
--- a/velox/flag_definitions/flags.cpp
+++ b/velox/flag_definitions/flags.cpp
@@ -55,6 +55,8 @@ DEFINE_int32(
 
 DEFINE_bool(avx2, true, "Enables use of AVX2 when available");
 
+DEFINE_bool(avx512f, true, "Enables use of AVX512F when available");
+
 DEFINE_bool(bmi2, true, "Enables use of BMI2 when available");
 
 // Used in exec/Expr.cpp

From 28588ad4bfa582269c2a2a1f4e191d4351bf252d Mon Sep 17 00:00:00 2001
From: yingsu00 <yingsu00@outlook.com>
Date: Sat, 9 May 2026 14:38:05 -0700
Subject: [PATCH 24/24] perf: Introduce OptimizedHashPartitionFunction

Introduce OptimizedHashPartitionFunction as a faster drop-in replacement
for HashPartitionFunction, gated behind a new query config flag
optimized_hash_partition_function_enabled (default false). partition()
becomes faster by anywhere from 50% to over 200x.

Add HashPartitionFunctionBase as a common base exposing numPartitions(),
and createHashPartitionFunction() factories that select the
implementation based on the flag. Thread a useOptimizedPartitionFunction
flag through PartitionFunctionSpec::create() and update callsites
(LocalPartition, ScaleWriterLocalPartition, PartitionedOutput,
OptimizedPartitionedOutput, MarkDistinct, RowNumber, Window,
SubPartitionedSortWindowBuild, HiveConnector) to pass the flag or to
construct partition functions via the factory.

Register CMake targets for the new test and benchmark binaries.
---
 velox/connectors/hive/HiveConnector.cpp       |   3 +-
 velox/connectors/hive/HiveConnector.h         |   3 +-
 velox/core/PlanNode.h                         |   9 +-
 velox/core/QueryConfig.cpp                    |   1 +
 velox/core/QueryConfig.h                      |  10 +
 velox/exec/CMakeLists.txt                     |   1 +
 velox/exec/HashPartitionFunction.cpp          |  47 +-
 velox/exec/HashPartitionFunction.h            |  38 +-
 velox/exec/LocalPartition.cpp                 |  11 +-
 velox/exec/MarkDistinct.cpp                   |  10 +-
 velox/exec/MarkDistinct.h                     |   2 +-
 velox/exec/OptimizedHashPartitionFunction.cpp | 270 ++++++++++
 velox/exec/OptimizedHashPartitionFunction.h   |  72 +++
 velox/exec/OptimizedPartitionedOutput.cpp     |   8 +-
 velox/exec/PartitionedOutput.cpp              |  11 +-
 velox/exec/RoundRobinPartitionFunction.h      |   3 +-
 velox/exec/RowNumber.cpp                      |  10 +-
 velox/exec/RowNumber.h                        |   2 +-
 velox/exec/ScaleWriterLocalPartition.cpp      |   5 +-
 velox/exec/SubPartitionedSortWindowBuild.cpp  |  17 +-
 velox/exec/SubPartitionedSortWindowBuild.h    |   3 +-
 velox/exec/Window.cpp                         |   1 +
 velox/exec/benchmarks/CMakeLists.txt          |  12 +
 ...ptimizedHashPartitionFunctionBenchmark.cpp | 469 ++++++++++++++++++
 velox/exec/tests/CMakeLists.txt               |   1 +
 .../OptimizedHashPartitionFunctionTest.cpp    | 138 ++++++
 velox/exec/tests/utils/PlanBuilder.cpp        |   3 +-
 .../tests/utils/PartitionedVectorTestBase.cpp |   8 +-
 28 files changed, 1128 insertions(+), 40 deletions(-)
 create mode 100644 velox/exec/OptimizedHashPartitionFunction.cpp
 create mode 100644 velox/exec/OptimizedHashPartitionFunction.h
 create mode 100644 velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp
 create mode 100644 velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp

diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp
index 062a507fc64..f52aeb7dd37 100644
--- a/velox/connectors/hive/HiveConnector.cpp
+++ b/velox/connectors/hive/HiveConnector.cpp
@@ -132,7 +132,8 @@ void HiveConnector::registerSerDe() {
 
 std::unique_ptr<core::PartitionFunction> HivePartitionFunctionSpec::create(
     int numPartitions,
-    bool localExchange) const {
+    bool localExchange,
+    bool /*useOptimizedPartitionFunction*/) const {
   std::vector<int> bucketToPartitions;
   if (bucketToPartition_.empty()) {
     // NOTE: if hive partition function spec doesn't specify bucket to partition
diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h
index 95c175c4f69..e3508cb4729 100644
--- a/velox/connectors/hive/HiveConnector.h
+++ b/velox/connectors/hive/HiveConnector.h
@@ -141,7 +141,8 @@ class HivePartitionFunctionSpec : public core::PartitionFunctionSpec {
 
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool localExchange) const override;
+      bool localExchange,
+      bool useOptimizedPartitionFunction = false) const override;
 
   std::string toString() const override;
 
diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h
index 4a1ba1579cd..109caf0f45d 100644
--- a/velox/core/PlanNode.h
+++ b/velox/core/PlanNode.h
@@ -2500,9 +2500,13 @@ class PartitionFunctionSpec : public ISerializable {
  public:
   /// If 'localExchange' is true, the partition function is used for local
   /// exchange within a velox task.
+  /// 'useOptimizedPartitionFunction' requests the optimized implementation.
+  /// TODO: Only HashPartitionFunctionSpec honors it for now; other specs
+  /// ignore the flag until the optimization is extended to them.
   virtual std::unique_ptr<PartitionFunction> create(
       int numPartitions,
-      bool localExchange = false) const = 0;
+      bool localExchange = false,
+      bool useOptimizedPartitionFunction = false) const = 0;
 
   virtual ~PartitionFunctionSpec() = default;
 
@@ -2515,7 +2519,8 @@ class GatherPartitionFunctionSpec : public PartitionFunctionSpec {
  public:
   std::unique_ptr<PartitionFunction> create(
       int /*numPartitions*/,
-      bool /*localExchange*/) const override {
+      bool /*localExchange*/,
+      bool /*useOptimizedPartitionFunction*/ = false) const override {
     VELOX_UNREACHABLE();
   }
 
diff --git a/velox/core/QueryConfig.cpp b/velox/core/QueryConfig.cpp
index 4a31862590a..8493d6546c7 100644
--- a/velox/core/QueryConfig.cpp
+++ b/velox/core/QueryConfig.cpp
@@ -90,6 +90,7 @@ const std::vector<config::ConfigProperty>& QueryConfig::registeredProperties() {
 
     // Partitioned output.
     VELOX_REGISTER_QUERY_CONFIG(kPartitionedOutputEagerFlush);
+    VELOX_REGISTER_QUERY_CONFIG(kOptimizedHashPartitionFunctionEnabled);
     VELOX_REGISTER_QUERY_CONFIG(kMaxPartitionedOutputBufferSize);
     VELOX_REGISTER_QUERY_CONFIG(kMaxOutputBufferSize);
 
diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h
index 0571284aedc..b30fb47bd1a 100644
--- a/velox/core/QueryConfig.h
+++ b/velox/core/QueryConfig.h
@@ -454,6 +454,16 @@ class QueryConfig {
       false,
       "Flush PartitionedOutput rows eagerly without buffering.")
 
+  /// If true, use OptimizedHashPartitionFunction in place of
+  /// HashPartitionFunction.
+  VELOX_QUERY_CONFIG(
+      kOptimizedHashPartitionFunctionEnabled,
+      optimizedHashPartitionFunctionEnabled,
+      "optimized_hash_partition_function_enabled",
+      bool,
+      false,
+      "Use OptimizedHashPartitionFunction instead of HashPartitionFunction.")
+
   /// The maximum number of bytes to buffer in PartitionedOutput operator to
   /// avoid creating tiny SerializedPages.
   VELOX_QUERY_CONFIG(
diff --git a/velox/exec/CMakeLists.txt b/velox/exec/CMakeLists.txt
index d77f0305bfd..626c7c06570 100644
--- a/velox/exec/CMakeLists.txt
+++ b/velox/exec/CMakeLists.txt
@@ -71,6 +71,7 @@ velox_add_library(
   OperatorTraceScan.cpp
   OperatorTraceWriter.cpp
   OperatorUtils.cpp
+  OptimizedHashPartitionFunction.cpp
   OptimizedPartitionedOutput.cpp
   OptimizedVectorHasher.cpp
   OrderBy.cpp
diff --git a/velox/exec/HashPartitionFunction.cpp b/velox/exec/HashPartitionFunction.cpp
index 896facc4efa..44f012e5e00 100644
--- a/velox/exec/HashPartitionFunction.cpp
+++ b/velox/exec/HashPartitionFunction.cpp
@@ -13,8 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <velox/exec/HashPartitionFunction.h>
-#include <velox/exec/VectorHasher.h>
+#include "velox/exec/HashPartitionFunction.h"
+
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+#include "velox/exec/VectorHasher.h"
 
 #define XXH_INLINE_ALL
 #include <xxhash.h> // @manual=third-party//xxHash:xxhash
@@ -123,9 +125,15 @@ std::optional<uint32_t> HashPartitionFunction::partition(
 
 std::unique_ptr<core::PartitionFunction> HashPartitionFunctionSpec::create(
     int numPartitions,
-    bool localExchange) const {
-  return std::make_unique<exec::HashPartitionFunction>(
-      localExchange, numPartitions, inputType_, keyChannels_, constValues_);
+    bool localExchange,
+    bool useOptimizedPartitionFunction) const {
+  return createHashPartitionFunction(
+      localExchange,
+      numPartitions,
+      inputType_,
+      keyChannels_,
+      constValues_,
+      useOptimizedPartitionFunction);
 }
 
 std::string HashPartitionFunctionSpec::toString() const {
@@ -180,4 +188,33 @@ core::PartitionFunctionSpecPtr HashPartitionFunctionSpec::deserialize(
   return std::make_shared<HashPartitionFunctionSpec>(
       ISerializable::deserialize<RowType>(obj["inputType"]), keys, constValues);
 }
+
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    bool localExchange,
+    int numPartitions,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues,
+    bool useOptimizedPartitionFunction) {
+  if (useOptimizedPartitionFunction) {
+    return std::make_unique<OptimizedHashPartitionFunction>(
+        localExchange, numPartitions, inputType, keyChannels, constValues);
+  }
+  return std::make_unique<HashPartitionFunction>(
+      localExchange, numPartitions, inputType, keyChannels, constValues);
+}
+
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    const HashBitRange& hashBitRange,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues,
+    bool useOptimizedPartitionFunction) {
+  if (useOptimizedPartitionFunction) {
+    return std::make_unique<OptimizedHashPartitionFunction>(
+        hashBitRange, inputType, keyChannels, constValues);
+  }
+  return std::make_unique<HashPartitionFunction>(
+      hashBitRange, inputType, keyChannels, constValues);
+}
 } // namespace facebook::velox::exec
diff --git a/velox/exec/HashPartitionFunction.h b/velox/exec/HashPartitionFunction.h
index 7aa6a032d6b..848fd42e0ac 100644
--- a/velox/exec/HashPartitionFunction.h
+++ b/velox/exec/HashPartitionFunction.h
@@ -15,19 +15,28 @@
  */
 #pragma once
 
-#include <velox/exec/HashBitRange.h>
-#include <velox/exec/VectorHasher.h>
 #include "velox/core/PlanNode.h"
+#include "velox/exec/HashBitRange.h"
+#include "velox/exec/VectorHasher.h"
 
 namespace facebook::velox::exec {
 
+class HashPartitionFunctionBase : public core::PartitionFunction {
+ public:
+  ~HashPartitionFunctionBase() override = default;
+
+  virtual int numPartitions() const = 0;
+};
+
 /// Calculates partition number for each row of the specified vector using a
 /// hash function. The constructor with hashBitRange parameter requires both
 /// hashBitRange and keyChannels to be non-empty. The constructor with
 /// numPartitions allows the keyChannels argument to be empty. If keyChannels is
 /// empty, then the resulting partition number of partition() will always be
 /// zero.
-class HashPartitionFunction : public core::PartitionFunction {
+/// Implements HashPartitionFunctionBase so that callers can read the
+/// configured number of partitions through the base interface.
+class HashPartitionFunction : public HashPartitionFunctionBase {
  public:
   HashPartitionFunction(
       bool localExchange,
@@ -48,7 +57,7 @@ class HashPartitionFunction : public core::PartitionFunction {
       const RowVector& input,
       std::vector<uint32_t>& partitions) override;
 
-  int numPartitions() const {
+  int numPartitions() const override {
     return numPartitions_;
   }
 
@@ -85,7 +94,8 @@ class HashPartitionFunctionSpec : public core::PartitionFunctionSpec {
 
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool localExchange) const override;
+      bool localExchange,
+      bool useOptimizedPartitionFunction = false) const override;
 
   std::string toString() const override;
 
@@ -100,4 +110,22 @@ class HashPartitionFunctionSpec : public core::PartitionFunctionSpec {
   const std::vector<column_index_t> keyChannels_;
   const std::vector<VectorPtr> constValues_;
 };
+
+/// Creates either HashPartitionFunction or OptimizedHashPartitionFunction
+/// based on 'useOptimizedPartitionFunction'.
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    bool localExchange,
+    int numPartitions,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues = {},
+    bool useOptimizedPartitionFunction = false);
+
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    const HashBitRange& hashBitRange,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues = {},
+    bool useOptimizedPartitionFunction = false);
+
 } // namespace facebook::velox::exec
diff --git a/velox/exec/LocalPartition.cpp b/velox/exec/LocalPartition.cpp
index eb6eb81add3..231b873d7fa 100644
--- a/velox/exec/LocalPartition.cpp
+++ b/velox/exec/LocalPartition.cpp
@@ -339,10 +339,13 @@ LocalPartition::LocalPartition(
           ctx->task->getLocalExchangeQueues(ctx->splitGroupId, planNode->id())},
       numPartitions_{queues_.size()},
       partitionFunction_(
-          numPartitions_ == 1 ? nullptr
-                              : planNode->partitionFunctionSpec().create(
-                                    numPartitions_,
-                                    /*localExchange=*/true)),
+          numPartitions_ == 1
+              ? nullptr
+              : planNode->partitionFunctionSpec().create(
+                    numPartitions_,
+                    /*localExchange=*/true,
+                    ctx->queryConfig()
+                        .optimizedHashPartitionFunctionEnabled())),
       singlePartitionBufferSize_{
           (numPartitions_ <
                ctx->queryConfig()
diff --git a/velox/exec/MarkDistinct.cpp b/velox/exec/MarkDistinct.cpp
index 2b562c714af..83ae15a2cad 100644
--- a/velox/exec/MarkDistinct.cpp
+++ b/velox/exec/MarkDistinct.cpp
@@ -356,8 +356,14 @@ void MarkDistinct::setupInputSpiller(
       &spillConfig_.value(),
       spillStats_.get());
 
-  spillHashFunction_ = std::make_unique<HashPartitionFunction>(
-      inputSpiller_->hashBits(), inputType_, distinctKeyChannels_);
+  spillHashFunction_ = createHashPartitionFunction(
+      inputSpiller_->hashBits(),
+      inputType_,
+      distinctKeyChannels_,
+      {},
+      operatorCtx_->driverCtx()
+          ->queryConfig()
+          .optimizedHashPartitionFunctionEnabled());
 }
 
 void MarkDistinct::spill() {
diff --git a/velox/exec/MarkDistinct.h b/velox/exec/MarkDistinct.h
index c8c582b5ea8..f386ff77bd9 100644
--- a/velox/exec/MarkDistinct.h
+++ b/velox/exec/MarkDistinct.h
@@ -106,7 +106,7 @@ class MarkDistinct : public Operator {
 
   SpillPartitionSet spillInputPartitionSet_;
 
-  std::unique_ptr<HashPartitionFunction> spillHashFunction_;
+  std::unique_ptr<HashPartitionFunctionBase> spillHashFunction_;
 
   SpillPartitionSet spillHashTablePartitionSet_;
 
diff --git a/velox/exec/OptimizedHashPartitionFunction.cpp b/velox/exec/OptimizedHashPartitionFunction.cpp
new file mode 100644
index 00000000000..ac83b7d8d27
--- /dev/null
+++ b/velox/exec/OptimizedHashPartitionFunction.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+
+#include <algorithm>
+
+#include <folly/Portability.h>
+
+#include "velox/common/process/ProcessBase.h"
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+#include <immintrin.h>
+#endif
+
+#define XXH_INLINE_ALL
+#include <xxhash.h> // @manual=third-party//xxHash:xxhash
+
+namespace facebook::velox::exec {
+namespace {
+// Gets the hash value for local exchange with given 'rawHash'. 'rawHash'
+// is the value computed by this hash function which is used for remote
+// shuffle across stages like for Prestissimo.
+static inline uint32_t localExchangeHash(uint32_t rawHash) {
+  // Mix the bits so we don't use the same hash used to distribute between
+  // stages.
+  bits::reverseBits(reinterpret_cast<uint8_t*>(&rawHash), sizeof(rawHash));
+  return XXH32(&rawHash, sizeof(rawHash), 0);
+}
+
+FOLLY_ALWAYS_INLINE uint32_t mixedHash(uint64_t hash) {
+  return static_cast<uint32_t>(hash) ^ static_cast<uint32_t>(hash >> 32);
+}
+
+FOLLY_ALWAYS_INLINE uint32_t
+reduceRange(uint64_t hash, uint32_t numPartitions) {
+  return (static_cast<uint64_t>(mixedHash(hash)) * numPartitions) >> 32;
+}
+
+void rangeReductionPowerOfTwo(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  VELOX_DCHECK(bits::isPowerOfTwo(numPartitions));
+  // 0 and 1 need no hash bits: __builtin_ctz(0) and a 32-bit shift are UB.
+  if (numPartitions <= 1) {
+    std::fill(partitions, partitions + size, 0);
+    return;
+  }
+  // Keep the top log2(numPartitions) bits of the mixed 32-bit hash.
+  const auto shift = 32 - __builtin_ctz(numPartitions);
+  for (vector_size_t index = 0; index < size; ++index) {
+    partitions[index] = mixedHash(hashes[index]) >> shift;
+  }
+}
+
+#if defined(__AVX512F__)
+void rangeReductionAvx512(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  const __m512i numPartitionsVec = _mm512_set1_epi64(numPartitions);
+
+  vector_size_t index = 0;
+  for (; index + 8 <= size; index += 8) {
+    const auto hashesVec =
+        _mm512_loadu_si512(reinterpret_cast<const __m512i*>(hashes + index));
+
+    const auto mixedHashesVec =
+        _mm512_xor_si512(hashesVec, _mm512_srli_epi64(hashesVec, 32));
+    const auto productVec = _mm512_mul_epu32(mixedHashesVec, numPartitionsVec);
+    const auto shiftedVec = _mm512_srli_epi64(productVec, 32);
+    const auto packedResults = _mm512_cvtepi64_epi32(shiftedVec);
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(partitions + index), packedResults);
+  }
+
+  for (; index < size; ++index) {
+    partitions[index] = reduceRange(hashes[index], numPartitions);
+  }
+}
+#endif
+
+#if defined(__AVX2__)
+void rangeReductionAvx2(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  const auto packIndexes = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
+  const auto numPartitionsVec = _mm256_set1_epi64x(numPartitions);
+
+  vector_size_t index = 0;
+  for (; index + 4 <= size; index += 4) {
+    const auto hashesVec =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(hashes + index));
+    const auto mixedHashesVec =
+        _mm256_xor_si256(hashesVec, _mm256_srli_epi64(hashesVec, 32));
+    const auto productVec = _mm256_mul_epu32(mixedHashesVec, numPartitionsVec);
+    const auto shiftedVec = _mm256_srli_epi64(productVec, 32);
+    const auto packedResults =
+        _mm256_permutevar8x32_epi32(shiftedVec, packIndexes);
+    _mm_storeu_si128(
+        reinterpret_cast<__m128i*>(partitions + index),
+        _mm256_castsi256_si128(packedResults));
+  }
+
+  for (; index < size; ++index) {
+    partitions[index] = reduceRange(hashes[index], numPartitions);
+  }
+}
+#endif
+
+void rangeReductionImpl(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  if (bits::isPowerOfTwo(numPartitions)) {
+    rangeReductionPowerOfTwo(hashes, partitions, size, numPartitions);
+    return;
+  }
+
+#if defined(__AVX512F__)
+  if (process::hasAvx512f()) {
+    rangeReductionAvx512(hashes, partitions, size, numPartitions);
+    return;
+  }
+#endif
+
+#if defined(__AVX2__)
+  if (process::hasAvx2()) {
+    rangeReductionAvx2(hashes, partitions, size, numPartitions);
+    return;
+  }
+#endif
+
+  for (vector_size_t index = 0; index < size; ++index) {
+    partitions[index] = reduceRange(hashes[index], numPartitions);
+  }
+}
+
+void applyLocalExchangeHash(raw_vector<uint64_t>& hashes) {
+  for (auto& hash : hashes) {
+    hash = localExchangeHash(hash);
+  }
+}
+
+void applyHashBitRange(
+    const HashBitRange& hashBitRange,
+    const raw_vector<uint64_t>& hashes,
+    std::vector<uint32_t>& partitions) {
+  partitions.resize(hashes.size());
+  for (auto index = 0; index < hashes.size(); ++index) {
+    partitions[index] = hashBitRange.partition(hashes[index]);
+  }
+}
+
+} // namespace
+
+void rangeReduction(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  rangeReductionImpl(hashes, partitions, size, numPartitions);
+}
+
+OptimizedHashPartitionFunction::OptimizedHashPartitionFunction(
+    bool localExchange,
+    int numPartitions,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues)
+    : localExchange_{localExchange}, numPartitions_{numPartitions} {
+  init(inputType, keyChannels, constValues);
+}
+
+OptimizedHashPartitionFunction::OptimizedHashPartitionFunction(
+    const HashBitRange& hashBitRange,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues)
+    : localExchange_{false},
+      numPartitions_{hashBitRange.numPartitions()},
+      hashBitRange_(hashBitRange) {
+  VELOX_CHECK_GT(hashBitRange.numPartitions(), 0);
+  VELOX_CHECK(!keyChannels.empty());
+  init(inputType, keyChannels, constValues);
+}
+
+std::optional<uint32_t> OptimizedHashPartitionFunction::partition(
+    const RowVector& input,
+    std::vector<uint32_t>& partitions) {
+  if (hashers_.empty()) {
+    return 0u;
+  }
+
+  const auto size = input.size();
+  if (size == 0) {
+    partitions.clear();
+    return std::nullopt;
+  }
+  // Without a hash bit range, a single partition needs no hashing at all.
+  if (!hashBitRange_.has_value() && numPartitions_ == 1) {
+    return 0u;
+  }
+
+  rows_.resize(size);
+  rows_.setAll();
+
+  hashes_.resize(size);
+  for (size_t i = 0; i < hashers_.size(); ++i) {
+    const auto& hasher = hashers_[i];
+    if (hasher->channel() != kConstantChannel) {
+      hasher->decode(*input.childAt(hasher->channel()), rows_);
+      hasher->hash(rows_, i > 0, hashes_);
+    } else {
+      hasher->hashPrecomputed(i > 0, hashes_);
+    }
+  }
+
+  if (localExchange_) {
+    applyLocalExchangeHash(hashes_);
+  }
+  // Map the per-row hashes to partition numbers, one entry per input row.
+  if (hashBitRange_.has_value()) {
+    applyHashBitRange(*hashBitRange_, hashes_, partitions);
+  } else {
+    partitions.resize(size);
+    rangeReduction(hashes_.data(), partitions.data(), size, numPartitions_);
+  }
+
+  return std::nullopt;
+}
+
+void OptimizedHashPartitionFunction::init(
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues) {
+  hashers_.reserve(keyChannels.size());
+  size_t constChannel{0};
+  for (const auto channel : keyChannels) {
+    if (channel != kConstantChannel) {
+      hashers_.emplace_back(
+          OptimizedVectorHasher::create(inputType->childAt(channel), channel));
+    } else {
+      const auto& constValue = constValues[constChannel++];
+      hashers_.emplace_back(
+          OptimizedVectorHasher::create(constValue->type(), channel));
+      hashers_.back()->precompute(*constValue);
+    }
+  }
+}
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedHashPartitionFunction.h b/velox/exec/OptimizedHashPartitionFunction.h
new file mode 100644
index 00000000000..bc7ceb1ac0b
--- /dev/null
+++ b/velox/exec/OptimizedHashPartitionFunction.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/exec/HashPartitionFunction.h"
+#include "velox/exec/OptimizedVectorHasher.h"
+
+namespace facebook::velox::exec {
+
+/// Maps hashes to partitions using range reduction. Visible for testing.
+void rangeReduction(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions);
+
+/// Calculates partition numbers using OptimizedVectorHasher.
+class OptimizedHashPartitionFunction : public HashPartitionFunctionBase {
+ public:
+  OptimizedHashPartitionFunction(
+      bool localExchange,
+      int numPartitions,
+      const RowTypePtr& inputType,
+      const std::vector<column_index_t>& keyChannels,
+      const std::vector<VectorPtr>& constValues = {});
+
+  OptimizedHashPartitionFunction(
+      const HashBitRange& hashBitRange,
+      const RowTypePtr& inputType,
+      const std::vector<column_index_t>& keyChannels,
+      const std::vector<VectorPtr>& constValues = {});
+
+  ~OptimizedHashPartitionFunction() override = default;
+
+  std::optional<uint32_t> partition(
+      const RowVector& input,
+      std::vector<uint32_t>& partitions) override;
+
+  int numPartitions() const override {
+    return numPartitions_;
+  }
+
+ private:
+  void init(
+      const RowTypePtr& inputType,
+      const std::vector<column_index_t>& keyChannels,
+      const std::vector<VectorPtr>& constValues);
+
+  const bool localExchange_;
+  const int numPartitions_;
+  const std::optional<HashBitRange> hashBitRange_ = std::nullopt;
+  std::vector<std::unique_ptr<OptimizedVectorHasher>> hashers_;
+
+  // Reusable memory.
+  SelectivityVector rows_;
+  raw_vector<uint64_t> hashes_;
+};
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedPartitionedOutput.cpp b/velox/exec/OptimizedPartitionedOutput.cpp
index d9983a18cbd..a8da3786b81 100644
--- a/velox/exec/OptimizedPartitionedOutput.cpp
+++ b/velox/exec/OptimizedPartitionedOutput.cpp
@@ -54,9 +54,11 @@ OptimizedPartitionedOutput::OptimizedPartitionedOutput(
                                 .maxPartitionedOutputBufferSize()),
       pool_(pool()),
       partitionFunction_(
-          numDestinations_ == 1
-              ? nullptr
-              : planNode->partitionFunctionSpec().create(numDestinations_)) {
+          numDestinations_ == 1 ? nullptr
+                                : planNode->partitionFunctionSpec().create(
+                                      numDestinations_,
+                                      /*localExchange=*/false,
+                                      true)) {
   if (!planNode->isPartitioned()) {
     VELOX_USER_CHECK_EQ(numDestinations_, 1);
   }
diff --git a/velox/exec/PartitionedOutput.cpp b/velox/exec/PartitionedOutput.cpp
index ba4e23d738b..74320389489 100644
--- a/velox/exec/PartitionedOutput.cpp
+++ b/velox/exec/PartitionedOutput.cpp
@@ -207,10 +207,13 @@ PartitionedOutput::PartitionedOutput(
       numDestinations_(planNode->numPartitions()),
       replicateNullsAndAny_(planNode->isReplicateNullsAndAny()),
       partitionFunction_(
-          numDestinations_ == 1 ? nullptr
-                                : planNode->partitionFunctionSpec().create(
-                                      numDestinations_,
-                                      /*localExchange=*/false)),
+          numDestinations_ == 1
+              ? nullptr
+              : planNode->partitionFunctionSpec().create(
+                    numDestinations_,
+                    /*localExchange=*/false,
+                    ctx->queryConfig()
+                        .optimizedHashPartitionFunctionEnabled())),
       outputChannels_(calculateOutputChannels(
           planNode->inputType(),
           planNode->outputType(),
diff --git a/velox/exec/RoundRobinPartitionFunction.h b/velox/exec/RoundRobinPartitionFunction.h
index b84c6d2ffaf..a13ed529f55 100644
--- a/velox/exec/RoundRobinPartitionFunction.h
+++ b/velox/exec/RoundRobinPartitionFunction.h
@@ -43,7 +43,8 @@ class RoundRobinPartitionFunctionSpec : public core::PartitionFunctionSpec {
  public:
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool /*localExchange*/) const override {
+      bool /*localExchange*/,
+      bool /*useOptimizedPartitionFunction*/ = false) const override {
     return std::make_unique<velox::exec::RoundRobinPartitionFunction>(
         numPartitions);
   }
diff --git a/velox/exec/RowNumber.cpp b/velox/exec/RowNumber.cpp
index cd2cd4ce36a..04427975120 100644
--- a/velox/exec/RowNumber.cpp
+++ b/velox/exec/RowNumber.cpp
@@ -449,8 +449,14 @@ void RowNumber::setupInputSpiller(
     keyChannels.push_back(hasher->channel());
   }
 
-  spillHashFunction_ = std::make_unique<HashPartitionFunction>(
-      inputSpiller_->hashBits(), inputType_, keyChannels);
+  spillHashFunction_ = createHashPartitionFunction(
+      inputSpiller_->hashBits(),
+      inputType_,
+      keyChannels,
+      {},
+      operatorCtx_->driverCtx()
+          ->queryConfig()
+          .optimizedHashPartitionFunctionEnabled());
 }
 
 void RowNumber::spill() {
diff --git a/velox/exec/RowNumber.h b/velox/exec/RowNumber.h
index b34fc9d9c20..8e53713fc77 100644
--- a/velox/exec/RowNumber.h
+++ b/velox/exec/RowNumber.h
@@ -142,7 +142,7 @@ class RowNumber : public Operator {
   SpillPartitionSet spillInputPartitionSet_;
 
   // Used to calculate the spill partition numbers of the inputs.
-  std::unique_ptr<HashPartitionFunction> spillHashFunction_;
+  std::unique_ptr<HashPartitionFunctionBase> spillHashFunction_;
 
   // The cpu may be voluntarily yield after running too long when processing
   // input from spilled file.
diff --git a/velox/exec/ScaleWriterLocalPartition.cpp b/velox/exec/ScaleWriterLocalPartition.cpp
index 7530ff403a0..1764adabf6a 100644
--- a/velox/exec/ScaleWriterLocalPartition.cpp
+++ b/velox/exec/ScaleWriterLocalPartition.cpp
@@ -57,7 +57,10 @@ ScaleWriterPartitioningLocalPartition::ScaleWriterPartitioningLocalPartition(
       ? nullptr
       : planNode->partitionFunctionSpec().create(
             numTablePartitions_,
-            /*localExchange=*/true);
+            /*localExchange=*/true,
+            operatorCtx_->driverCtx()
+                ->queryConfig()
+                .optimizedHashPartitionFunctionEnabled());
 }
 
 void ScaleWriterPartitioningLocalPartition::initialize() {
diff --git a/velox/exec/SubPartitionedSortWindowBuild.cpp b/velox/exec/SubPartitionedSortWindowBuild.cpp
index 2f2a247a8d4..db437748fbb 100644
--- a/velox/exec/SubPartitionedSortWindowBuild.cpp
+++ b/velox/exec/SubPartitionedSortWindowBuild.cpp
@@ -22,6 +22,7 @@ namespace facebook::velox::exec {
 SubPartitionedSortWindowBuild::SubPartitionedSortWindowBuild(
     const std::shared_ptr<const core::WindowNode>& node,
     int32_t numSubPartitions,
+    const core::QueryConfig& queryConfig,
     velox::memory::MemoryPool* pool,
     common::PrefixSortConfig&& prefixSortConfig,
     const common::SpillConfig* spillConfig,
@@ -40,8 +41,13 @@ SubPartitionedSortWindowBuild::SubPartitionedSortWindowBuild(
   for (int i = 0; i < numPartitionKeys_; i++) {
     keyChannels[i] = inputChannels_[i];
   }
-  subPartitioningFunction_ = std::make_unique<HashPartitionFunction>(
-      false, numSubPartitions_, node->inputType(), keyChannels);
+  subPartitioningFunction_ = createHashPartitionFunction(
+      /*localExchange=*/false,
+      numSubPartitions_,
+      node->inputType(),
+      keyChannels,
+      {},
+      queryConfig.optimizedHashPartitionFunctionEnabled());
   subWindowBuilds_.resize(numSubPartitions_);
   for (int i = 0; i < numSubPartitions_; i++) {
     subWindowBuilds_[i] = std::make_unique<SortWindowBuild>(
@@ -59,7 +65,12 @@ void SubPartitionedSortWindowBuild::addInput(RowVectorPtr input) {
   VELOX_CHECK_LT(currentSubPartition_, 0);
 
   subPartitionIdsBuffer_.resize(input->size());
-  subPartitioningFunction_->partition(*input, subPartitionIdsBuffer_);
+  std::optional<uint32_t> singlePartition =
+      subPartitioningFunction_->partition(*input, subPartitionIdsBuffer_);
+  if (singlePartition.has_value()) {
+    simd::simdFill<uint32_t>(
+        subPartitionIdsBuffer_.data(), singlePartition.value(), input->size());
+  }
 
   for (auto i = 0; i < inputChannels_.size(); ++i) {
     decodedInputVectors_[i].decode(*input->childAt(inputChannels_[i]));
diff --git a/velox/exec/SubPartitionedSortWindowBuild.h b/velox/exec/SubPartitionedSortWindowBuild.h
index 8735f438d30..f0da95bdf95 100644
--- a/velox/exec/SubPartitionedSortWindowBuild.h
+++ b/velox/exec/SubPartitionedSortWindowBuild.h
@@ -33,6 +33,7 @@ class SubPartitionedSortWindowBuild : public WindowBuild {
   SubPartitionedSortWindowBuild(
       const std::shared_ptr<const core::WindowNode>& node,
       int32_t numSubPartitions,
+      const core::QueryConfig& queryConfig,
       velox::memory::MemoryPool* pool,
       common::PrefixSortConfig&& prefixSortConfig,
       const common::SpillConfig* spillConfig,
@@ -80,7 +81,7 @@ class SubPartitionedSortWindowBuild : public WindowBuild {
   exec::SpillStats* const spillStats_;
 
   // Divide input rows to the corresponding sub partitions.
-  std::unique_ptr<HashPartitionFunction> subPartitioningFunction_;
+  std::unique_ptr<HashPartitionFunctionBase> subPartitioningFunction_;
 
   // WindowBuilds for each sub partition.
   std::vector<std::unique_ptr<SortWindowBuild>> subWindowBuilds_;
diff --git a/velox/exec/Window.cpp b/velox/exec/Window.cpp
index f9107522f0a..b763371a801 100644
--- a/velox/exec/Window.cpp
+++ b/velox/exec/Window.cpp
@@ -75,6 +75,7 @@ Window::Window(
       windowBuild_ = std::make_unique<SubPartitionedSortWindowBuild>(
           windowNode,
           numSubPartitions,
+          driverCtx->queryConfig(),
           pool(),
           makePrefixSortConfig(driverCtx->queryConfig()),
           spillConfig,
diff --git a/velox/exec/benchmarks/CMakeLists.txt b/velox/exec/benchmarks/CMakeLists.txt
index 3962d439833..3ccff61baae 100644
--- a/velox/exec/benchmarks/CMakeLists.txt
+++ b/velox/exec/benchmarks/CMakeLists.txt
@@ -29,6 +29,18 @@ target_link_libraries(
   Folly::follybenchmark
 )
 
+add_executable(
+  velox_exec_optimized_hash_partition_function_benchmark
+  OptimizedHashPartitionFunctionBenchmark.cpp
+)
+
+target_link_libraries(
+  velox_exec_optimized_hash_partition_function_benchmark
+  velox_exec
+  velox_vector_test_lib
+  Folly::follybenchmark
+)
+
 add_executable(velox_filter_project_benchmark FilterProjectBenchmark.cpp)
 
 target_link_libraries(
diff --git a/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp b/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp
new file mode 100644
index 00000000000..3d2635fda94
--- /dev/null
+++ b/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <array>
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/tests/utils/VectorMaker.h"
+
+// Define the gtest flags below so the benchmark can be run from CLion.
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+constexpr vector_size_t kSize = 10'000;
+constexpr vector_size_t kDictionarySize = kSize / 5;
+
+enum class FunctionKind {
+  kNormal,
+  kOptimized,
+};
+
+enum class EncodingMode {
+  kFlat,
+  kDictionary,
+  kConstant,
+};
+
+enum class NullMode {
+  kNoNulls,
+  kHalfNulls,
+  kAllNulls,
+};
+
+enum class PartitionMode {
+  kRemote,
+  kLocalExchange,
+  kHashBitRangeFirst8,
+  kHashBitRangeLast8,
+};
+
+template <typename T>
+T makeValue(vector_size_t row) {
+  return static_cast<T>((row * 8191) ^ (row >> 3));
+}
+
+template <>
+bool makeValue<bool>(vector_size_t row) {
+  return (row & 1) == 0;
+}
+
+template <>
+StringView makeValue<StringView>(vector_size_t row) {
+  thread_local std::array<char, 20> buffer;
+  const auto length = 5 + row % 16;
+  for (vector_size_t index = 0; index < length; ++index) {
+    buffer[index] = 'a' + (row + index * 7) % 26;
+  }
+  return StringView(buffer.data(), length);
+}
+
+std::function<bool(vector_size_t)> makeNulls(NullMode nullMode) {
+  switch (nullMode) {
+    case NullMode::kNoNulls:
+      return nullptr;
+    case NullMode::kHalfNulls:
+      return [](vector_size_t row) { return (row & 1) == 0; };
+    case NullMode::kAllNulls:
+      return [](vector_size_t /*row*/) { return true; };
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+// Wraps 'base' in a dictionary of 'size' rows where row r points at base row
+// (size - r - 1) % base->size(). 'nullMode' adds nulls on the wrapper itself:
+// even rows for kHalfNulls, every row for kAllNulls, none for kNoNulls.
+VectorPtr wrapInDictionary(
+    const VectorPtr& base,
+    vector_size_t size,
+    memory::MemoryPool* pool,
+    NullMode nullMode = NullMode::kNoNulls) {
+  const auto baseSize = base->size();
+  auto indices = AlignedBuffer::allocate<vector_size_t>(size, pool);
+  auto* indexData = indices->asMutable<vector_size_t>();
+  for (vector_size_t row = 0; row < size; ++row) {
+    indexData[row] = (size - row - 1) % baseSize;
+  }
+
+  BufferPtr nulls;
+  if (nullMode != NullMode::kNoNulls) {
+    const bool allNulls = nullMode == NullMode::kAllNulls;
+    nulls = AlignedBuffer::allocate<bool>(size, pool);
+    auto* nullBits = nulls->asMutable<uint64_t>();
+    bits::fillBits(nullBits, 0, size, allNulls ? bits::kNull : bits::kNotNull);
+    // kHalfNulls starts from all-valid bits and then nulls out the even rows.
+    for (vector_size_t row = 0; !allNulls && row < size; row += 2) {
+      bits::setNull(nullBits, row);
+    }
+  }
+  return BaseVector::wrapInDictionary(nulls, indices, size, base);
+}
+
+template <typename T>
+VectorPtr makeValuesVector(
+    VectorMaker& vectorMaker,
+    memory::MemoryPool* pool,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    vector_size_t size) {
+  const auto flatSize =
+      encodingMode == EncodingMode::kDictionary ? kDictionarySize : size;
+  auto flat = vectorMaker.flatVector<T>(
+      flatSize,
+      [](vector_size_t row) { return makeValue<T>(row); },
+      makeNulls(nullMode));
+
+  switch (encodingMode) {
+    case EncodingMode::kFlat:
+      return flat;
+    case EncodingMode::kDictionary:
+      return wrapInDictionary(flat, size, pool);
+    case EncodingMode::kConstant:
+      if (nullMode == NullMode::kAllNulls) {
+        return BaseVector::createNullConstant(
+            CppToType<T>::create(), size, pool);
+      }
+      if (nullMode == NullMode::kHalfNulls) {
+        auto constant = BaseVector::wrapInConstant(size, 1, flat);
+        // ConstantVector has one nullness for all logical rows. Use a
+        // dictionary wrapper to express alternating nulls while keeping the
+        // repeated-value payload constant.
+        return wrapInDictionary(constant, size, pool, nullMode);
+      }
+      return BaseVector::wrapInConstant(size, 0, flat);
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+template <FunctionKind Kind>
+std::unique_ptr<HashPartitionFunctionBase> makePartitionFunction(
+    PartitionMode partitionMode,
+    const RowTypePtr& inputType,
+    int numPartitions) {
+  switch (partitionMode) {
+    case PartitionMode::kRemote:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            false, numPartitions, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            false, numPartitions, inputType, std::vector<column_index_t>{0});
+      }
+    case PartitionMode::kLocalExchange:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            true, numPartitions, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            true, numPartitions, inputType, std::vector<column_index_t>{0});
+      }
+    case PartitionMode::kHashBitRangeFirst8:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            HashBitRange{0, 8}, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            HashBitRange{0, 8}, inputType, std::vector<column_index_t>{0});
+      }
+    case PartitionMode::kHashBitRangeLast8:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            HashBitRange{56, 64}, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            HashBitRange{56, 64}, inputType, std::vector<column_index_t>{0});
+      }
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+void normalRangeReduction(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    int size,
+    uint32_t numPartitions) {
+  for (int index = 0; index < size; ++index) {
+    partitions[index] = hashes[index] % numPartitions;
+  }
+}
+
+template <FunctionKind Kind>
+void runRangeReductionBenchmark(uint32_t iterations, uint32_t numPartitions) {
+  folly::BenchmarkSuspender suspender;
+
+  std::vector<uint64_t> hashes(kSize);
+  std::vector<uint32_t> partitions(kSize);
+  for (vector_size_t row = 0; row < kSize; ++row) {
+    hashes[row] = (static_cast<uint64_t>(row * 8191) << 32) ^
+        static_cast<uint64_t>(row * 1315423911ULL + 17);
+  }
+
+  suspender.dismiss();
+
+  for (uint32_t iteration = 0; iteration < iterations; ++iteration) {
+    if constexpr (Kind == FunctionKind::kNormal) {
+      normalRangeReduction(
+          hashes.data(), partitions.data(), kSize, numPartitions);
+    } else {
+      rangeReduction(hashes.data(), partitions.data(), kSize, numPartitions);
+    }
+    folly::doNotOptimizeAway(partitions.data());
+  }
+}
+
+// Times 'iterations' partition() calls over one kSize-row input column. Setup
+// (pool, vector and function construction) is excluded via the suspender.
+template <typename T, FunctionKind Kind>
+void runPartitionBenchmark(
+    uint32_t iterations,
+    PartitionMode partitionMode,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    int numPartitions) {
+  folly::BenchmarkSuspender suspender;
+  auto pool = memory::memoryManager()->addLeafPool();
+  VectorMaker vectorMaker(pool.get());
+  auto values = makeValuesVector<T>(
+      vectorMaker, pool.get(), encodingMode, nullMode, kSize);
+  auto input = vectorMaker.rowVector({values});
+  auto partitionFunction = makePartitionFunction<Kind>(
+      partitionMode, asRowType(input->type()), numPartitions);
+  std::vector<uint32_t> partitions;
+  suspender.dismiss();
+
+  for (uint32_t iteration = 0; iteration < iterations; ++iteration) {
+    const auto single = partitionFunction->partition(*input, partitions);
+    // The function may report one partition without sizing 'partitions', so
+    // size and fill it here; std::fill over the empty vector was a no-op.
+    if (single.has_value()) {
+      partitions.assign(input->size(), single.value());
+    }
+    folly::doNotOptimizeAway(partitions.data());
+  }
+}
+
+template <typename T>
+void benchmarkNormalHashPartitionFunction(
+    uint32_t iterations,
+    PartitionMode partitionMode,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    int numPartitions) {
+  runPartitionBenchmark<T, FunctionKind::kNormal>(
+      iterations, partitionMode, encodingMode, nullMode, numPartitions);
+}
+
+template <typename T>
+void benchmarkOptimizedHashPartitionFunction(
+    uint32_t iterations,
+    PartitionMode partitionMode,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    int numPartitions) {
+  runPartitionBenchmark<T, FunctionKind::kOptimized>(
+      iterations, partitionMode, encodingMode, nullMode, numPartitions);
+}
+
+#define REGISTER_PARTITION_PAIR(                                                                                  \
+    T,                                                                                                            \
+    TYPE_NAME,                                                                                                    \
+    PARTITION_MODE,                                                                                               \
+    PARTITION_NAME,                                                                                               \
+    NUM_PARTITIONS,                                                                                               \
+    NUM_PARTITIONS_NAME,                                                                                          \
+    ENCODING_MODE,                                                                                                \
+    ENCODING_NAME,                                                                                                \
+    NULL_MODE,                                                                                                    \
+    NULL_NAME)                                                                                                    \
+  BENCHMARK(                                                                                                      \
+      partition_##TYPE_NAME##_##PARTITION_NAME##_##NUM_PARTITIONS_NAME##_##ENCODING_NAME##_##NULL_NAME,           \
+      iterations) {                                                                                               \
+    benchmarkNormalHashPartitionFunction<T>(                                                                      \
+        iterations, PARTITION_MODE, ENCODING_MODE, NULL_MODE, NUM_PARTITIONS);                                    \
+  }                                                                                                               \
+  BENCHMARK_RELATIVE(                                                                                             \
+      optimized_partition_##TYPE_NAME##_##PARTITION_NAME##_##NUM_PARTITIONS_NAME##_##ENCODING_NAME##_##NULL_NAME, \
+      iterations) {                                                                                               \
+    benchmarkOptimizedHashPartitionFunction<T>(                                                                   \
+        iterations, PARTITION_MODE, ENCODING_MODE, NULL_MODE, NUM_PARTITIONS);                                    \
+  }                                                                                                               \
+  BENCHMARK_DRAW_LINE();
+
+#define REGISTER_PARTITION_NULL_MODES( \
+    T,                                 \
+    TYPE_NAME,                         \
+    PARTITION_MODE,                    \
+    PARTITION_NAME,                    \
+    NUM_PARTITIONS,                    \
+    NUM_PARTITIONS_NAME,               \
+    ENCODING_MODE,                     \
+    ENCODING_NAME)                     \
+  REGISTER_PARTITION_PAIR(             \
+      T,                               \
+      TYPE_NAME,                       \
+      PARTITION_MODE,                  \
+      PARTITION_NAME,                  \
+      NUM_PARTITIONS,                  \
+      NUM_PARTITIONS_NAME,             \
+      ENCODING_MODE,                   \
+      ENCODING_NAME,                   \
+      NullMode::kNoNulls,              \
+      no_null)                         \
+  REGISTER_PARTITION_PAIR(             \
+      T,                               \
+      TYPE_NAME,                       \
+      PARTITION_MODE,                  \
+      PARTITION_NAME,                  \
+      NUM_PARTITIONS,                  \
+      NUM_PARTITIONS_NAME,             \
+      ENCODING_MODE,                   \
+      ENCODING_NAME,                   \
+      NullMode::kHalfNulls,            \
+      half_null)                       \
+  REGISTER_PARTITION_PAIR(             \
+      T,                               \
+      TYPE_NAME,                       \
+      PARTITION_MODE,                  \
+      PARTITION_NAME,                  \
+      NUM_PARTITIONS,                  \
+      NUM_PARTITIONS_NAME,             \
+      ENCODING_MODE,                   \
+      ENCODING_NAME,                   \
+      NullMode::kAllNulls,             \
+      all_null)
+
+#define REGISTER_PARTITION_ENCODINGS( \
+    T,                                \
+    TYPE_NAME,                        \
+    PARTITION_MODE,                   \
+    PARTITION_NAME,                   \
+    NUM_PARTITIONS,                   \
+    NUM_PARTITIONS_NAME)              \
+  REGISTER_PARTITION_NULL_MODES(      \
+      T,                              \
+      TYPE_NAME,                      \
+      PARTITION_MODE,                 \
+      PARTITION_NAME,                 \
+      NUM_PARTITIONS,                 \
+      NUM_PARTITIONS_NAME,            \
+      EncodingMode::kFlat,            \
+      flat)                           \
+  REGISTER_PARTITION_NULL_MODES(      \
+      T,                              \
+      TYPE_NAME,                      \
+      PARTITION_MODE,                 \
+      PARTITION_NAME,                 \
+      NUM_PARTITIONS,                 \
+      NUM_PARTITIONS_NAME,            \
+      EncodingMode::kDictionary,      \
+      dictionary)                     \
+  REGISTER_PARTITION_NULL_MODES(      \
+      T,                              \
+      TYPE_NAME,                      \
+      PARTITION_MODE,                 \
+      PARTITION_NAME,                 \
+      NUM_PARTITIONS,                 \
+      NUM_PARTITIONS_NAME,            \
+      EncodingMode::kConstant,        \
+      constant)
+
+#define REGISTER_PARTITION_COUNTS(                                \
+    T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME)                 \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1, p1)        \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 4, p4)        \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 16, p16)      \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 100, p100)    \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1'000, p1000) \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1'024, p1024)
+
+#define REGISTER_PARTITION_MODES(T, TYPE_NAME)                            \
+  REGISTER_PARTITION_COUNTS(T, TYPE_NAME, PartitionMode::kRemote, remote) \
+  REGISTER_PARTITION_COUNTS(                                              \
+      T, TYPE_NAME, PartitionMode::kLocalExchange, local_exchange)        \
+  REGISTER_PARTITION_ENCODINGS(                                           \
+      T,                                                                  \
+      TYPE_NAME,                                                          \
+      PartitionMode::kHashBitRangeFirst8,                                 \
+      hashbits_0_8,                                                       \
+      0,                                                                  \
+      hashbits)                                                           \
+  REGISTER_PARTITION_ENCODINGS(                                           \
+      T,                                                                  \
+      TYPE_NAME,                                                          \
+      PartitionMode::kHashBitRangeLast8,                                  \
+      hashbits_last_8,                                                    \
+      0,                                                                  \
+      hashbits)
+
+REGISTER_PARTITION_MODES(bool, bool)
+REGISTER_PARTITION_MODES(int8_t, tinyint)
+REGISTER_PARTITION_MODES(int16_t, smallint)
+REGISTER_PARTITION_MODES(int32_t, integer)
+REGISTER_PARTITION_MODES(int64_t, bigint)
+REGISTER_PARTITION_MODES(StringView, varchar)
+
+#define REGISTER_RANGE_REDUCTION_PAIR(NUM_PARTITIONS, NUM_PARTITIONS_NAME) \
+  BENCHMARK(normal_range_reduction_##NUM_PARTITIONS_NAME, iterations) {    \
+    runRangeReductionBenchmark<FunctionKind::kNormal>(                     \
+        iterations, NUM_PARTITIONS);                                       \
+  }                                                                        \
+  BENCHMARK_RELATIVE(                                                      \
+      optimized_range_reduction_##NUM_PARTITIONS_NAME, iterations) {       \
+    runRangeReductionBenchmark<FunctionKind::kOptimized>(                  \
+        iterations, NUM_PARTITIONS);                                       \
+  }                                                                        \
+  BENCHMARK_DRAW_LINE();
+
+REGISTER_RANGE_REDUCTION_PAIR(1, p1)
+REGISTER_RANGE_REDUCTION_PAIR(4, p4)
+REGISTER_RANGE_REDUCTION_PAIR(16, p16)
+REGISTER_RANGE_REDUCTION_PAIR(100, p100)
+REGISTER_RANGE_REDUCTION_PAIR(1'000, p1000)
+REGISTER_RANGE_REDUCTION_PAIR(1'024, p1024)
+
+#undef REGISTER_PARTITION_MODES
+#undef REGISTER_PARTITION_COUNTS
+#undef REGISTER_PARTITION_ENCODINGS
+#undef REGISTER_PARTITION_NULL_MODES
+#undef REGISTER_PARTITION_PAIR
+#undef REGISTER_RANGE_REDUCTION_PAIR
+
+} // namespace
+
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv};
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt
index 119fcecc0bd..189e7fc8680 100644
--- a/velox/exec/tests/CMakeLists.txt
+++ b/velox/exec/tests/CMakeLists.txt
@@ -65,6 +65,7 @@ set(
   EnforceDistinctTest.cpp
   TraceUtilTest.cpp
   HashPartitionFunctionTest.cpp
+  OptimizedHashPartitionFunctionTest.cpp
   SpatialIndexTest.cpp
   ValuesTest.cpp
   ParallelProjectTest.cpp
diff --git a/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp b/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp
new file mode 100644
index 00000000000..b9d6b193159
--- /dev/null
+++ b/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+
+class OptimizedHashPartitionFunctionTest : public velox::test::VectorTestBase,
+                                           public testing::Test {
+ protected:
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{});
+  }
+};
+
+TEST_F(
+    OptimizedHashPartitionFunctionTest,
+    powerOfTwoRangeReductionMatchesMultiplyHigh) {
+  const std::vector<uint64_t> hashes = {
+      0,
+      1,
+      0x0000'0001'0000'0000ULL,
+      0x1234'5678'9abc'def0ULL,
+      0xffff'ffff'ffff'ffffULL,
+  };
+
+  for (const auto numPartitions : {1, 2, 4, 1'024}) {
+    std::vector<uint32_t> partitions(hashes.size());
+    rangeReduction(
+        hashes.data(),
+        partitions.data(),
+        static_cast<vector_size_t>(hashes.size()),
+        numPartitions);
+
+    std::vector<uint32_t> expected;
+    expected.reserve(hashes.size());
+    for (const auto hash : hashes) {
+      const auto mixedHash =
+          static_cast<uint32_t>(hash) ^ static_cast<uint32_t>(hash >> 32);
+      expected.push_back(
+          (static_cast<uint64_t>(mixedHash) * numPartitions) >> 32);
+    }
+
+    EXPECT_EQ(partitions, expected);
+  }
+}
+
+TEST_F(
+    OptimizedHashPartitionFunctionTest,
+    optimizedHashBitRangeMatchesRegular) {
+  const auto numRows = 10'000;
+  auto input = makeRowVector(
+      {makeNullableFlatVector<int64_t>([&] {
+         std::vector<std::optional<int64_t>> values;
+         values.reserve(numRows);
+         for (auto row = 0; row < numRows; ++row) {
+           values.emplace_back(
+               row % 17 == 0 ? std::nullopt : std::optional<int64_t>(row * 13));
+         }
+         return values;
+       }()),
+       makeFlatVector<StringView>(numRows, [](auto row) {
+         return StringView::makeInline(fmt::format("value_{}", row % 97));
+       })});
+  const auto rowType = asRowType(input->type());
+
+  HashPartitionFunction regular(HashBitRange{0, 5}, rowType, {0, 1});
+  OptimizedHashPartitionFunction optimized(HashBitRange{0, 5}, rowType, {0, 1});
+
+  std::vector<uint32_t> regularPartitions;
+  std::vector<uint32_t> optimizedPartitions;
+  EXPECT_EQ(
+      regular.partition(*input, regularPartitions),
+      optimized.partition(*input, optimizedPartitions));
+  EXPECT_EQ(regularPartitions, optimizedPartitions);
+}
+
+TEST_F(OptimizedHashPartitionFunctionTest, onePartitionReturnsConstantResult) {
+  auto input = makeRowVector({makeConstant(true, 10'000)});
+  const auto rowType = asRowType(input->type());
+  OptimizedHashPartitionFunction partitionFunction(
+      /*localExchange=*/true, 1, rowType, {0});
+
+  std::vector<uint32_t> partitions{123};
+  EXPECT_EQ(partitionFunction.partition(*input, partitions), 0u);
+  EXPECT_EQ(partitions, std::vector<uint32_t>{123});
+}
+
+TEST_F(OptimizedHashPartitionFunctionTest, emptyConstantKeyReturnsEmptyResult) {
+  auto input = makeRowVector({makeConstant(true, 0)});
+  const auto rowType = asRowType(input->type());
+  OptimizedHashPartitionFunction optimized(
+      /*localExchange=*/true, 16, rowType, {0});
+
+  std::vector<uint32_t> optimizedPartitions{123};
+  EXPECT_EQ(optimized.partition(*input, optimizedPartitions), std::nullopt);
+  EXPECT_TRUE(optimizedPartitions.empty());
+}
+
+// The spec must create the implementation selected by create()'s third flag.
+TEST_F(OptimizedHashPartitionFunctionTest, specUsesConfiguredImplementation) {
+  auto input = makeRowVector(
+      {makeFlatVector<int32_t>({1, 2, 3, 4}),
+       makeFlatVector<StringView>({"a", "b", "c", "d"})});
+  const auto rowType = asRowType(input->type());
+  HashPartitionFunctionSpec spec(rowType, std::vector<column_index_t>{0, 1});
+  auto optimizedFunction = spec.create(8, /*localExchange=*/false, true);
+  ASSERT_NE(
+      dynamic_cast<OptimizedHashPartitionFunction*>(optimizedFunction.get()),
+      nullptr);
+  auto regularFunction = spec.create(8, /*localExchange=*/false);
+  ASSERT_NE(
+      dynamic_cast<HashPartitionFunction*>(regularFunction.get()), nullptr);
+
+  std::vector<uint32_t> optimizedPartitions;
+  ASSERT_EQ(
+      optimizedFunction->partition(*input, optimizedPartitions), std::nullopt);
+  ASSERT_EQ(optimizedPartitions.size(), input->size());
+  for (const auto partition : optimizedPartitions) {
+    EXPECT_LT(partition, 8u);
+  }
+}
diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp
index 9117cba55ee..468ae20bf79 100644
--- a/velox/exec/tests/utils/PlanBuilder.cpp
+++ b/velox/exec/tests/utils/PlanBuilder.cpp
@@ -1689,7 +1689,8 @@ class RoundRobinRowPartitionFunctionSpec : public core::PartitionFunctionSpec {
  public:
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool /*localExchange*/) const override {
+      bool /*localExchange*/,
+      bool /*useOptimizedPartitionFunction*/ = false) const override {
     return std::make_unique<RoundRobinRowPartitionFunction>(numPartitions);
   }
 
diff --git a/velox/vector/tests/utils/PartitionedVectorTestBase.cpp b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp
index 6c939dfb569..e9191ba0b8f 100644
--- a/velox/vector/tests/utils/PartitionedVectorTestBase.cpp
+++ b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp
@@ -98,8 +98,12 @@ std::vector<VectorPtr> PartitionedVectorTestBase::partitionRowVectors(
   std::vector<uint32_t> partitions(totalNumRows, 0);
   if (numPartitions > 1) {
     auto rowType = asRowType(mergedRowVector->type());
-    //    auto partitionFunction = createPartitionFunction(rowType, {0});
-    partitionFunction->partition(*mergedRowVector->as<RowVector>(), partitions);
+    std::optional<uint32_t> singlePartition = partitionFunction->partition(
+        *mergedRowVector->as<RowVector>(), partitions);
+    if (singlePartition.has_value()) {
+      // All rows go to the same partition
+      std::fill(partitions.begin(), partitions.end(), singlePartition.value());
+    }
   }
 
   std::vector<VectorPtr> partitionedVectors =