diff --git a/scripts/setup-common.sh b/scripts/setup-common.sh
index 9699939d2af..6673e432a16 100755
--- a/scripts/setup-common.sh
+++ b/scripts/setup-common.sh
@@ -48,7 +48,12 @@ function install_fmt {
 
 function install_folly {
   wget_and_untar https://github.com/facebook/folly/archive/refs/tags/"${FB_OS_VERSION}".tar.gz folly
-  local FOLLY_FLAGS=(-DBUILD_SHARED_LIBS="$VELOX_BUILD_SHARED" -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON)
+  local FOLLY_FLAGS=(
+    -DBUILD_SHARED_LIBS="$VELOX_BUILD_SHARED"
+    -DBUILD_TESTS=OFF
+    -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}"
+    -DFOLLY_HAVE_INT128_T=ON
+  )
   # When folly is static, use static gflags to avoid dual gflags flag
   # registration when .so plugins are dlopen'd (both the binary and plugin
   # would register the same flags in a shared gflags registry).
diff --git a/scripts/setup-helper-functions.sh b/scripts/setup-helper-functions.sh
index a50fb02ae0e..30bfb7d523d 100755
--- a/scripts/setup-helper-functions.sh
+++ b/scripts/setup-helper-functions.sh
@@ -81,7 +81,8 @@ function github_checkout {
 # The values that CPU_ARCH can take are as follows:
 #   arm64  : Target Apple silicon.
 #   aarch64: Target general 64 bit arm cpus.
-#   avx:     Target Intel CPUs with AVX.
+#   avx512:  Target Intel CPUs with AVX-512F.
+#   avx:     Target Intel CPUs with AVX2.
 #   sse:     Target Intel CPUs with sse.
 # Echo's the appropriate compiler flags which can be captured as so
 # CXX_FLAGS=$(get_cxx_flags) or
@@ -102,7 +103,9 @@ function get_cxx_flags {
       else # x86_64
         local CPU_CAPABILITIES
         CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}')
-        if [[ $CPU_CAPABILITIES =~ "avx" ]]; then
+        if [[ $CPU_CAPABILITIES =~ "avx512f" ]]; then
+          CPU_ARCH="avx512"
+        elif [[ $CPU_CAPABILITIES =~ "avx" ]]; then
           CPU_ARCH="avx"
         else
           CPU_ARCH="sse"
@@ -114,7 +117,9 @@ function get_cxx_flags {
       else # x86_64
         local CPU_CAPABILITIES
         CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1 | awk '{print tolower($0)}')
-        if [[ $CPU_CAPABILITIES =~ "avx" ]]; then
+        if [[ $CPU_CAPABILITIES =~ "avx512f" ]]; then
+          CPU_ARCH="avx512"
+        elif [[ $CPU_CAPABILITIES =~ "avx" ]]; then
           CPU_ARCH="avx"
         elif [[ $CPU_CAPABILITIES =~ "sse" ]]; then
           CPU_ARCH="sse"
@@ -131,8 +136,12 @@ function get_cxx_flags {
     echo -n "-mcpu=apple-m1+crc"
     ;;
 
+  "avx512")
+    echo -n "-mavx512f -mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2"
+    ;;
+
   "avx")
-    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt  -mbmi2"
+    echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2"
     ;;
 
   "sse")
diff --git a/velox/common/process/ProcessBase.cpp b/velox/common/process/ProcessBase.cpp
index 0b9a4df2c64..3cbb7fa6a42 100644
--- a/velox/common/process/ProcessBase.cpp
+++ b/velox/common/process/ProcessBase.cpp
@@ -32,6 +32,8 @@ DECLARE_bool(avx2); // Enables use of AVX2 when available NOLINT
 
 DECLARE_bool(bmi2); // Enables use of BMI2 when available NOLINT
 
+DECLARE_bool(avx512f); // Enables use of AVX512F when available NOLINT
+
 namespace facebook {
 namespace velox {
 namespace process {
@@ -106,6 +108,7 @@ uint64_t threadCpuNanos() {
 namespace {
 bool bmi2CpuFlag = folly::CpuId().bmi2();
 bool avx2CpuFlag = folly::CpuId().avx2();
+bool avx512fCpuFlag = folly::CpuId().avx512f();
 } // namespace
 
 bool hasAvx2() {
@@ -124,6 +127,14 @@ bool hasBmi2() {
 #endif
 }
 
+bool hasAvx512f() {
+#ifdef __AVX512F__ // Only meaningful when the build targets AVX512F.
+  return avx512fCpuFlag && FLAGS_avx512f; // CPU support and not disabled.
+#else
+  return false; // Built without AVX512F: never report support.
+#endif
+}
+
 } // namespace process
 } // namespace velox
 } // namespace facebook
diff --git a/velox/common/process/ProcessBase.h b/velox/common/process/ProcessBase.h
index 34edd6d1467..7ca400b4efa 100644
--- a/velox/common/process/ProcessBase.h
+++ b/velox/common/process/ProcessBase.h
@@ -46,6 +46,10 @@ uint64_t threadCpuNanos();
 /// by flag.
 bool hasAvx2();
 
+/// True if built with AVX512F, the CPU supports AVX512F instructions and these
+/// are not disabled by flag.
+bool hasAvx512f();
+
 /// True if the machine has Intel BMI2 instructions and these are not disabled
 /// by flag.
 bool hasBmi2();
diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp
index 062a507fc64..f52aeb7dd37 100644
--- a/velox/connectors/hive/HiveConnector.cpp
+++ b/velox/connectors/hive/HiveConnector.cpp
@@ -132,7 +132,8 @@ void HiveConnector::registerSerDe() {
 
 std::unique_ptr<core::PartitionFunction> HivePartitionFunctionSpec::create(
     int numPartitions,
-    bool localExchange) const {
+    bool localExchange,
+    bool /*useOptimizedPartitionFunction*/) const {
   std::vector<int> bucketToPartitions;
   if (bucketToPartition_.empty()) {
     // NOTE: if hive partition function spec doesn't specify bucket to partition
diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h
index 95c175c4f69..e3508cb4729 100644
--- a/velox/connectors/hive/HiveConnector.h
+++ b/velox/connectors/hive/HiveConnector.h
@@ -141,7 +141,8 @@ class HivePartitionFunctionSpec : public core::PartitionFunctionSpec {
 
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool localExchange) const override;
+      bool localExchange,
+      bool useOptimizedPartitionFunction = false) const override;
 
   std::string toString() const override;
 
diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h
index 4a1ba1579cd..109caf0f45d 100644
--- a/velox/core/PlanNode.h
+++ b/velox/core/PlanNode.h
@@ -2500,9 +2500,13 @@ class PartitionFunctionSpec : public ISerializable {
  public:
   /// If 'localExchange' is true, the partition function is used for local
   /// exchange within a velox task.
+  /// TODO: 'useOptimizedPartitionFunction' is currently honored only by
+  /// HashPartitionFunctionSpec; other specs ignore it. Extend the optimization
+  /// to the remaining partition functions.
   virtual std::unique_ptr<PartitionFunction> create(
       int numPartitions,
-      bool localExchange = false) const = 0;
+      bool localExchange = false,
+      bool useOptimizedPartitionFunction = false) const = 0;
 
   virtual ~PartitionFunctionSpec() = default;
 
@@ -2515,7 +2519,8 @@ class GatherPartitionFunctionSpec : public PartitionFunctionSpec {
  public:
   std::unique_ptr<PartitionFunction> create(
       int /*numPartitions*/,
-      bool /*localExchange*/) const override {
+      bool /*localExchange*/,
+      bool /*useOptimizedPartitionFunction*/ = false) const override {
     VELOX_UNREACHABLE();
   }
 
diff --git a/velox/core/QueryConfig.cpp b/velox/core/QueryConfig.cpp
index 4a31862590a..8493d6546c7 100644
--- a/velox/core/QueryConfig.cpp
+++ b/velox/core/QueryConfig.cpp
@@ -90,6 +90,7 @@ const std::vector<config::ConfigProperty>& QueryConfig::registeredProperties() {
 
     // Partitioned output.
     VELOX_REGISTER_QUERY_CONFIG(kPartitionedOutputEagerFlush);
+    VELOX_REGISTER_QUERY_CONFIG(kOptimizedHashPartitionFunctionEnabled);
     VELOX_REGISTER_QUERY_CONFIG(kMaxPartitionedOutputBufferSize);
     VELOX_REGISTER_QUERY_CONFIG(kMaxOutputBufferSize);
 
diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h
index 772015daa1f..b30fb47bd1a 100644
--- a/velox/core/QueryConfig.h
+++ b/velox/core/QueryConfig.h
@@ -454,6 +454,16 @@ class QueryConfig {
       false,
       "Flush PartitionedOutput rows eagerly without buffering.")
 
+  /// If true, use OptimizedHashPartitionFunction in place of
+  /// HashPartitionFunction.
+  VELOX_QUERY_CONFIG(
+      kOptimizedHashPartitionFunctionEnabled,
+      optimizedHashPartitionFunctionEnabled,
+      "optimized_hash_partition_function_enabled",
+      bool,
+      false,
+      "Use OptimizedHashPartitionFunction instead of HashPartitionFunction.")
+
   /// The maximum number of bytes to buffer in PartitionedOutput operator to
   /// avoid creating tiny SerializedPages.
   VELOX_QUERY_CONFIG(
@@ -1469,6 +1479,14 @@ class QueryConfig {
       1000,
       "Batch size threshold for zero-copy in MarkSorted operator.")
 
+  VELOX_QUERY_CONFIG(
+      kOptimizedPartitionedOutputEnabled,
+      optimizedPartitionedOutputEnabled,
+      "optimized_repartitioning",
+      bool,
+      false,
+      "Enable OptimizedPartitionedOutput operator.")
+
   // --- Hand-written accessors for properties that need custom logic ---
 
   // Generated by VELOX_QUERY_CONFIG for simple properties above.
diff --git a/velox/exec/CMakeLists.txt b/velox/exec/CMakeLists.txt
index 3a5bec7e6e8..626c7c06570 100644
--- a/velox/exec/CMakeLists.txt
+++ b/velox/exec/CMakeLists.txt
@@ -71,6 +71,9 @@ velox_add_library(
   OperatorTraceScan.cpp
   OperatorTraceWriter.cpp
   OperatorUtils.cpp
+  OptimizedHashPartitionFunction.cpp
+  OptimizedPartitionedOutput.cpp
+  OptimizedVectorHasher.cpp
   OrderBy.cpp
   OutputBuffer.cpp
   OutputBufferManager.cpp
@@ -177,6 +180,7 @@ velox_add_library(
   OperatorTraceWriter.h
   OperatorType.h
   OperatorUtils.h
+  OptimizedVectorHasher.h
   OrderBy.h
   OutputBuffer.h
   OutputBufferManager.h
diff --git a/velox/exec/HashPartitionFunction.cpp b/velox/exec/HashPartitionFunction.cpp
index 896facc4efa..44f012e5e00 100644
--- a/velox/exec/HashPartitionFunction.cpp
+++ b/velox/exec/HashPartitionFunction.cpp
@@ -13,8 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <velox/exec/HashPartitionFunction.h>
-#include <velox/exec/VectorHasher.h>
+#include "velox/exec/HashPartitionFunction.h"
+
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+#include "velox/exec/VectorHasher.h"
 
 #define XXH_INLINE_ALL
 #include <xxhash.h> // @manual=third-party//xxHash:xxhash
@@ -123,9 +125,15 @@ std::optional<uint32_t> HashPartitionFunction::partition(
 
 std::unique_ptr<core::PartitionFunction> HashPartitionFunctionSpec::create(
     int numPartitions,
-    bool localExchange) const {
-  return std::make_unique<exec::HashPartitionFunction>(
-      localExchange, numPartitions, inputType_, keyChannels_, constValues_);
+    bool localExchange,
+    bool useOptimizedPartitionFunction) const {
+  return createHashPartitionFunction(
+      localExchange,
+      numPartitions,
+      inputType_,
+      keyChannels_,
+      constValues_,
+      useOptimizedPartitionFunction);
 }
 
 std::string HashPartitionFunctionSpec::toString() const {
@@ -180,4 +188,33 @@ core::PartitionFunctionSpecPtr HashPartitionFunctionSpec::deserialize(
   return std::make_shared<HashPartitionFunctionSpec>(
       ISerializable::deserialize<RowType>(obj["inputType"]), keys, constValues);
 }
+
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    bool localExchange,
+    int numPartitions,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues,
+    bool useOptimizedPartitionFunction) {
+  if (!useOptimizedPartitionFunction) { // Default: HashPartitionFunction.
+    return std::make_unique<HashPartitionFunction>(
+        localExchange, numPartitions, inputType, keyChannels, constValues);
+  }
+  return std::make_unique<OptimizedHashPartitionFunction>(
+      localExchange, numPartitions, inputType, keyChannels, constValues);
+}
+
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    const HashBitRange& hashBitRange,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues,
+    bool useOptimizedPartitionFunction) {
+  if (!useOptimizedPartitionFunction) { // Default: HashPartitionFunction.
+    return std::make_unique<HashPartitionFunction>(
+        hashBitRange, inputType, keyChannels, constValues);
+  }
+  return std::make_unique<OptimizedHashPartitionFunction>(
+      hashBitRange, inputType, keyChannels, constValues);
+}
 } // namespace facebook::velox::exec
diff --git a/velox/exec/HashPartitionFunction.h b/velox/exec/HashPartitionFunction.h
index 7aa6a032d6b..848fd42e0ac 100644
--- a/velox/exec/HashPartitionFunction.h
+++ b/velox/exec/HashPartitionFunction.h
@@ -15,19 +15,28 @@
  */
 #pragma once
 
-#include <velox/exec/HashBitRange.h>
-#include <velox/exec/VectorHasher.h>
 #include "velox/core/PlanNode.h"
+#include "velox/exec/HashBitRange.h"
+#include "velox/exec/VectorHasher.h"
 
 namespace facebook::velox::exec {
 
+class HashPartitionFunctionBase : public core::PartitionFunction {
+ public:
+  ~HashPartitionFunctionBase() override = default;
+
+  virtual int numPartitions() const = 0; // Configured partition count.
+};
+
 /// Calculates partition number for each row of the specified vector using a
 /// hash function. The constructor with hashBitRange parameter requires both
 /// hashBitRange and keyChannels to be non-empty. The constructor with
 /// numPartitions allows the keyChannels argument to be empty. If keyChannels is
 /// empty, then the resulting partition number of partition() will always be
 /// zero.
-class HashPartitionFunction : public core::PartitionFunction {
+/// Implements HashPartitionFunctionBase so callers can query the configured
+/// number of partitions through the common base type.
+class HashPartitionFunction : public HashPartitionFunctionBase {
  public:
   HashPartitionFunction(
       bool localExchange,
@@ -48,7 +57,7 @@ class HashPartitionFunction : public core::PartitionFunction {
       const RowVector& input,
       std::vector<uint32_t>& partitions) override;
 
-  int numPartitions() const {
+  int numPartitions() const override {
     return numPartitions_;
   }
 
@@ -85,7 +94,8 @@ class HashPartitionFunctionSpec : public core::PartitionFunctionSpec {
 
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool localExchange) const override;
+      bool localExchange,
+      bool useOptimizedPartitionFunction = false) const override;
 
   std::string toString() const override;
 
@@ -100,4 +110,22 @@ class HashPartitionFunctionSpec : public core::PartitionFunctionSpec {
   const std::vector<column_index_t> keyChannels_;
   const std::vector<VectorPtr> constValues_;
 };
+
+/// Creates either HashPartitionFunction or OptimizedHashPartitionFunction
+/// based on 'useOptimizedPartitionFunction'.
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    bool localExchange,
+    int numPartitions,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues = {},
+    bool useOptimizedPartitionFunction = false);
+
+std::unique_ptr<HashPartitionFunctionBase> createHashPartitionFunction(
+    const HashBitRange& hashBitRange,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues = {},
+    bool useOptimizedPartitionFunction = false);
+
 } // namespace facebook::velox::exec
diff --git a/velox/exec/LocalPartition.cpp b/velox/exec/LocalPartition.cpp
index eb6eb81add3..231b873d7fa 100644
--- a/velox/exec/LocalPartition.cpp
+++ b/velox/exec/LocalPartition.cpp
@@ -339,10 +339,13 @@ LocalPartition::LocalPartition(
           ctx->task->getLocalExchangeQueues(ctx->splitGroupId, planNode->id())},
       numPartitions_{queues_.size()},
       partitionFunction_(
-          numPartitions_ == 1 ? nullptr
-                              : planNode->partitionFunctionSpec().create(
-                                    numPartitions_,
-                                    /*localExchange=*/true)),
+          numPartitions_ == 1
+              ? nullptr
+              : planNode->partitionFunctionSpec().create(
+                    numPartitions_,
+                    /*localExchange=*/true,
+                    ctx->queryConfig()
+                        .optimizedHashPartitionFunctionEnabled())),
       singlePartitionBufferSize_{
           (numPartitions_ <
                ctx->queryConfig()
diff --git a/velox/exec/LocalPlanner.cpp b/velox/exec/LocalPlanner.cpp
index 39f009fe39a..a46daa8b4f1 100644
--- a/velox/exec/LocalPlanner.cpp
+++ b/velox/exec/LocalPlanner.cpp
@@ -37,6 +37,7 @@
 #include "velox/exec/NestedLoopJoinBuild.h"
 #include "velox/exec/NestedLoopJoinProbe.h"
 #include "velox/exec/OperatorTraceScan.h"
+#include "velox/exec/OptimizedPartitionedOutput.h"
 #include "velox/exec/OrderBy.h"
 #include "velox/exec/ParallelProject.h"
 #include "velox/exec/PartitionedOutput.h"
@@ -553,9 +554,15 @@ std::shared_ptr<Driver> DriverFactory::createDriver(
         auto partitionedOutputNode =
             std::dynamic_pointer_cast<const core::PartitionedOutputNode>(
                 planNode)) {
-      operators.push_back(
-          std::make_unique<PartitionedOutput>(
-              id, ctx.get(), partitionedOutputNode, eagerFlush(*planNode)));
+      if (ctx->queryConfig().optimizedPartitionedOutputEnabled()) {
+        operators.push_back(
+            std::make_unique<OptimizedPartitionedOutput>(
+                id, ctx.get(), partitionedOutputNode));
+      } else {
+        operators.push_back(
+            std::make_unique<PartitionedOutput>(
+                id, ctx.get(), partitionedOutputNode, eagerFlush(*planNode)));
+      }
     } else if (
         auto joinNode =
             std::dynamic_pointer_cast<const core::HashJoinNode>(planNode)) {
diff --git a/velox/exec/MarkDistinct.cpp b/velox/exec/MarkDistinct.cpp
index 2b562c714af..83ae15a2cad 100644
--- a/velox/exec/MarkDistinct.cpp
+++ b/velox/exec/MarkDistinct.cpp
@@ -356,8 +356,14 @@ void MarkDistinct::setupInputSpiller(
       &spillConfig_.value(),
       spillStats_.get());
 
-  spillHashFunction_ = std::make_unique<HashPartitionFunction>(
-      inputSpiller_->hashBits(), inputType_, distinctKeyChannels_);
+  spillHashFunction_ = createHashPartitionFunction(
+      inputSpiller_->hashBits(),
+      inputType_,
+      distinctKeyChannels_,
+      {},
+      operatorCtx_->driverCtx()
+          ->queryConfig()
+          .optimizedHashPartitionFunctionEnabled());
 }
 
 void MarkDistinct::spill() {
diff --git a/velox/exec/MarkDistinct.h b/velox/exec/MarkDistinct.h
index c8c582b5ea8..f386ff77bd9 100644
--- a/velox/exec/MarkDistinct.h
+++ b/velox/exec/MarkDistinct.h
@@ -106,7 +106,7 @@ class MarkDistinct : public Operator {
 
   SpillPartitionSet spillInputPartitionSet_;
 
-  std::unique_ptr<HashPartitionFunction> spillHashFunction_;
+  std::unique_ptr<HashPartitionFunctionBase> spillHashFunction_;
 
   SpillPartitionSet spillHashTablePartitionSet_;
 
diff --git a/velox/exec/OptimizedHashPartitionFunction.cpp b/velox/exec/OptimizedHashPartitionFunction.cpp
new file mode 100644
index 00000000000..ac83b7d8d27
--- /dev/null
+++ b/velox/exec/OptimizedHashPartitionFunction.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+
+#include <algorithm>
+
+#include <folly/Portability.h>
+
+#include "velox/common/process/ProcessBase.h"
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+#include <immintrin.h>
+#endif
+
+#define XXH_INLINE_ALL
+#include <xxhash.h> // @manual=third-party//xxHash:xxhash
+
+namespace facebook::velox::exec {
+namespace {
+// Derives the hash used for local exchange from 'rawHash'. 'rawHash' is the
+// value computed by this hash function, which is also used for remote shuffle
+// across stages, e.g. in Prestissimo.
+static inline uint32_t localExchangeHash(uint32_t rawHash) {
+  // Mix the bits so we don't use the same hash used to distribute between
+  // stages.
+  bits::reverseBits(reinterpret_cast<uint8_t*>(&rawHash), sizeof(rawHash));
+  return XXH32(&rawHash, sizeof(rawHash), 0);
+}
+
+FOLLY_ALWAYS_INLINE uint32_t mixedHash(uint64_t hash) { // XOR-folds to 32 bits.
+  return static_cast<uint32_t>(hash) ^ static_cast<uint32_t>(hash >> 32);
+}
+
+FOLLY_ALWAYS_INLINE uint32_t
+reduceRange(uint64_t hash, uint32_t numPartitions) { // Result < numPartitions.
+  return (static_cast<uint64_t>(mixedHash(hash)) * numPartitions) >> 32;
+}
+
+void rangeReductionPowerOfTwo(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  VELOX_DCHECK(bits::isPowerOfTwo(numPartitions));
+
+  if (numPartitions <= 1) { // ctz(0) is undefined and 32 - ctz(1) == 32.
+    std::fill(partitions, partitions + size, 0);
+    return;
+  }
+
+  const auto shift = 32 - __builtin_ctz(numPartitions); // Keep the top bits.
+  for (vector_size_t index = 0; index < size; ++index) {
+    partitions[index] = mixedHash(hashes[index]) >> shift;
+  }
+}
+
+#if defined(__AVX512F__)
+void rangeReductionAvx512(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  const __m512i numPartitionsVec = _mm512_set1_epi64(numPartitions);
+
+  vector_size_t index = 0;
+  for (; index + 8 <= size; index += 8) { // Eight 64-bit hashes per vector.
+    const auto hashesVec =
+        _mm512_loadu_si512(reinterpret_cast<const __m512i*>(hashes + index));
+
+    const auto mixedHashesVec =
+        _mm512_xor_si512(hashesVec, _mm512_srli_epi64(hashesVec, 32));
+    const auto productVec = _mm512_mul_epu32(mixedHashesVec, numPartitionsVec);
+    const auto shiftedVec = _mm512_srli_epi64(productVec, 32);
+    const auto packedResults = _mm512_cvtepi64_epi32(shiftedVec);
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(partitions + index), packedResults);
+  }
+
+  for (; index < size; ++index) { // Scalar tail for the remaining rows.
+    partitions[index] = reduceRange(hashes[index], numPartitions);
+  }
+}
+#endif
+
+#if defined(__AVX2__)
+void rangeReductionAvx2(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  const auto packIndexes = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
+  const auto numPartitionsVec = _mm256_set1_epi64x(numPartitions);
+
+  vector_size_t index = 0;
+  for (; index + 4 <= size; index += 4) { // Four 64-bit hashes per vector.
+    const auto hashesVec =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(hashes + index));
+    const auto mixedHashesVec =
+        _mm256_xor_si256(hashesVec, _mm256_srli_epi64(hashesVec, 32));
+    const auto productVec = _mm256_mul_epu32(mixedHashesVec, numPartitionsVec);
+    const auto shiftedVec = _mm256_srli_epi64(productVec, 32);
+    const auto packedResults = // Gather the low dword of each 64-bit lane.
+        _mm256_permutevar8x32_epi32(shiftedVec, packIndexes);
+    _mm_storeu_si128(
+        reinterpret_cast<__m128i*>(partitions + index),
+        _mm256_castsi256_si128(packedResults));
+  }
+
+  for (; index < size; ++index) { // Scalar tail for the remaining rows.
+    partitions[index] = reduceRange(hashes[index], numPartitions);
+  }
+}
+#endif
+
+void rangeReductionImpl(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  if (bits::isPowerOfTwo(numPartitions)) { // Shift-only fast path.
+    rangeReductionPowerOfTwo(hashes, partitions, size, numPartitions);
+    return;
+  }
+
+#if defined(__AVX512F__)
+  if (process::hasAvx512f()) { // Widest SIMD path is tried first.
+    rangeReductionAvx512(hashes, partitions, size, numPartitions);
+    return;
+  }
+#endif
+
+#if defined(__AVX2__)
+  if (process::hasAvx2()) {
+    rangeReductionAvx2(hashes, partitions, size, numPartitions);
+    return;
+  }
+#endif
+
+  for (vector_size_t index = 0; index < size; ++index) { // Scalar fallback.
+    partitions[index] = reduceRange(hashes[index], numPartitions);
+  }
+}
+
+void applyLocalExchangeHash(raw_vector<uint64_t>& hashes) {
+  for (auto& hash : hashes) {
+    hash = localExchangeHash(hash); // Narrows to the low 32 bits first.
+  }
+}
+
+void applyHashBitRange(
+    const HashBitRange& hashBitRange,
+    const raw_vector<uint64_t>& hashes,
+    std::vector<uint32_t>& partitions) {
+  partitions.resize(hashes.size()); // One output slot per input hash.
+  for (size_t index = 0; index < hashes.size(); ++index) {
+    partitions[index] = hashBitRange.partition(hashes[index]);
+  }
+}
+
+} // namespace
+
+void rangeReduction(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions) {
+  rangeReductionImpl(hashes, partitions, size, numPartitions);
+}
+
+OptimizedHashPartitionFunction::OptimizedHashPartitionFunction(
+    bool localExchange,
+    int numPartitions,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues)
+    : localExchange_{localExchange}, numPartitions_{numPartitions} {
+  init(inputType, keyChannels, constValues); // Builds one hasher per key.
+}
+
+OptimizedHashPartitionFunction::OptimizedHashPartitionFunction(
+    const HashBitRange& hashBitRange,
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues)
+    : localExchange_{false}, // Bit-range mode skips the local-exchange mix.
+      numPartitions_{hashBitRange.numPartitions()},
+      hashBitRange_(hashBitRange) {
+  VELOX_CHECK_GT(hashBitRange.numPartitions(), 0);
+  VELOX_CHECK(!keyChannels.empty()); // Bit-range mode requires keys.
+  init(inputType, keyChannels, constValues);
+}
+
+std::optional<uint32_t> OptimizedHashPartitionFunction::partition(
+    const RowVector& input,
+    std::vector<uint32_t>& partitions) {
+  if (hashers_.empty()) { // No keys: every row maps to partition 0.
+    return 0u;
+  }
+
+  const auto size = input.size();
+  if (size == 0) {
+    partitions.clear();
+    return std::nullopt;
+  }
+
+  if (!hashBitRange_.has_value() && numPartitions_ == 1) {
+    return 0u; // Single destination: skip hashing altogether.
+  }
+
+  rows_.resize(size);
+  rows_.setAll();
+
+  hashes_.resize(size);
+  for (size_t i = 0; i < hashers_.size(); ++i) {
+    auto& hasher = hashers_[i];
+    if (hasher->channel() != kConstantChannel) {
+      hasher->decode(*input.childAt(hasher->channel()), rows_);
+      hasher->hash(rows_, i > 0, hashes_);
+    } else {
+      hasher->hashPrecomputed(i > 0, hashes_); // Constant key.
+    }
+  }
+
+  if (localExchange_) {
+    applyLocalExchangeHash(hashes_);
+  }
+
+  if (hashBitRange_.has_value()) {
+    applyHashBitRange(*hashBitRange_, hashes_, partitions);
+  } else {
+    partitions.resize(size);
+    rangeReduction(hashes_.data(), partitions.data(), size, numPartitions_);
+  }
+
+  return std::nullopt; // Per-row results are in 'partitions'.
+}
+
+void OptimizedHashPartitionFunction::init(
+    const RowTypePtr& inputType,
+    const std::vector<column_index_t>& keyChannels,
+    const std::vector<VectorPtr>& constValues) {
+  hashers_.reserve(keyChannels.size());
+  size_t constChannel{0}; // Index of the next unused 'constValues' entry.
+  for (const auto channel : keyChannels) {
+    if (channel != kConstantChannel) {
+      hashers_.emplace_back(
+          OptimizedVectorHasher::create(inputType->childAt(channel), channel));
+    } else {
+      const auto& constValue = constValues[constChannel++];
+      hashers_.emplace_back(
+          OptimizedVectorHasher::create(constValue->type(), channel));
+      hashers_.back()->precompute(*constValue);
+    }
+  }
+}
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedHashPartitionFunction.h b/velox/exec/OptimizedHashPartitionFunction.h
new file mode 100644
index 00000000000..bc7ceb1ac0b
--- /dev/null
+++ b/velox/exec/OptimizedHashPartitionFunction.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/exec/HashPartitionFunction.h"
+#include "velox/exec/OptimizedVectorHasher.h"
+
+namespace facebook::velox::exec {
+
+/// Maps hashes to partitions using range reduction. Visible for testing.
+void rangeReduction(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    vector_size_t size,
+    uint32_t numPartitions);
+
+/// Calculates partition numbers using OptimizedVectorHasher.
+class OptimizedHashPartitionFunction : public HashPartitionFunctionBase {
+ public:
+  OptimizedHashPartitionFunction(
+      bool localExchange,
+      int numPartitions,
+      const RowTypePtr& inputType,
+      const std::vector<column_index_t>& keyChannels,
+      const std::vector<VectorPtr>& constValues = {});
+
+  OptimizedHashPartitionFunction(
+      const HashBitRange& hashBitRange,
+      const RowTypePtr& inputType,
+      const std::vector<column_index_t>& keyChannels,
+      const std::vector<VectorPtr>& constValues = {});
+
+  ~OptimizedHashPartitionFunction() override = default;
+
+  std::optional<uint32_t> partition(
+      const RowVector& input,
+      std::vector<uint32_t>& partitions) override;
+
+  int numPartitions() const override {
+    return numPartitions_;
+  }
+
+ private:
+  void init(
+      const RowTypePtr& inputType,
+      const std::vector<column_index_t>& keyChannels,
+      const std::vector<VectorPtr>& constValues);
+
+  const bool localExchange_; // Re-hash for local exchange when true.
+  const int numPartitions_;
+  const std::optional<HashBitRange> hashBitRange_ = std::nullopt;
+  std::vector<std::unique_ptr<OptimizedVectorHasher>> hashers_;
+
+  // Scratch buffers reused across partition() calls.
+  SelectivityVector rows_; // Sized to the input with all rows selected.
+  raw_vector<uint64_t> hashes_; // One hash per input row.
+};
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedPartitionedOutput.cpp b/velox/exec/OptimizedPartitionedOutput.cpp
new file mode 100644
index 00000000000..a8da3786b81
--- /dev/null
+++ b/velox/exec/OptimizedPartitionedOutput.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/exec/OptimizedPartitionedOutput.h"
+
+#include <unordered_map>
+
+#include "velox/exec/HashPartitionFunction.h"
+#include "velox/exec/SerializedPage.h"
+#include "velox/exec/Task.h"
+
+namespace facebook::velox::exec {
+
+OptimizedPartitionedOutput::OptimizedPartitionedOutput(
+    int32_t operatorId,
+    DriverCtx* ctx,
+    const std::shared_ptr<const core::PartitionedOutputNode>& planNode)
+    : Operator(
+          ctx,
+          planNode->outputType(),
+          operatorId,
+          planNode->id(),
+          "OptimizedPartitionedOutput"),
+      taskId_(operatorCtx_->taskId()),
+      inputType_(planNode->inputType()),
+      keyChannels_(toChannels(planNode->inputType(), planNode->keys())),
+      outputChannels_(calculateOutputChannels(
+          planNode->inputType(),
+          planNode->outputType(),
+          planNode->outputType())),
+      numDestinations_(planNode->numPartitions()),
+      replicateNullsAndAny_(planNode->isReplicateNullsAndAny()),
+      bufferManager_(OutputBufferManager::getInstanceRef()),
+      // NOTE: 'bufferReleaseFn_' holds a reference on the associated task to
+      // keep it alive while output buffers are still being accessed outside
+      // of the output buffer manager, e.g. in Prestissimo the http server
+      // holds the buffers while sending the data response.
+      bufferReleaseFn_([task = operatorCtx_->task()]() {}),
+      maxOutputBufferBytes_(ctx->task->queryCtx()
+                                ->queryConfig()
+                                .maxPartitionedOutputBufferSize()),
+      pool_(pool()),
+      partitionFunction_(
+          numDestinations_ == 1 ? nullptr
+                                : planNode->partitionFunctionSpec().create(
+                                      numDestinations_,
+                                      /*localExchange=*/false,
+                                      /*useOptimizedPartitionFunction=*/true)) {
+  if (!planNode->isPartitioned()) {
+    VELOX_USER_CHECK_EQ(numDestinations_, 1);
+  }
+  if (numDestinations_ == 1) {
+    VELOX_USER_CHECK(keyChannels_.empty());
+  }
+
+  serializer::presto::SerdeOpts options;
+  options.compressionKind = common::stringToCompressionKind(
+      operatorCtx_->driverCtx()->queryConfig().shuffleCompressionKind());
+  options.minCompressionRatio = 0.8;
+  // Must run first: fills serializerInputByOutput_ consumed just below.
+  initializeSerializerLayout();
+
+  serializer_ = std::make_unique<
+      serializer::presto::PrestoIterativePartitioningSerializer>(
+      outputType_,
+      numDestinations_,
+      options,
+      pool_,
+      serializerInputByOutput_,
+      [bufferManager =
+           bufferManager_]() -> std::unique_ptr<OutputStreamListener> {
+        auto lockedBufferManager = bufferManager.lock();
+        VELOX_CHECK_NOT_NULL(
+            lockedBufferManager, "OutputBufferManager was already destructed");
+        return lockedBufferManager->newListener();
+      });
+}
+
+void OptimizedPartitionedOutput::addInput(RowVectorPtr input) {
+  VELOX_USER_CHECK(
+      !replicateNullsAndAny_,
+      "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput");
+
+  // Project the serializer columns, then flush first if appending them would
+  // push the buffered size over the limit.
+  auto projected = prepareSerializerInput(input);
+  if (serializer_->estimateBytesAfterAppend(projected) >
+      maxOutputBufferBytes_) {
+    flush();
+  }
+
+  // One partition number per input row.
+  partitions_.resize(input->size());
+  if (numDestinations_ != 1) {
+    const std::optional<uint32_t> singlePartition =
+        partitionFunction_->partition(*input, partitions_);
+    if (singlePartition) {
+      // The function reported one partition for every row.
+      std::fill(partitions_.begin(), partitions_.end(), *singlePartition);
+    }
+  } else {
+    // Single destination: partitionFunction_ is null, every row goes to 0.
+    std::fill(partitions_.begin(), partitions_.end(), 0u);
+  }
+  serializer_->append(projected, partitions_);
+
+  auto statsLock = stats_.wlock();
+  ++numAppends_;
+  statsLock->addRuntimeStat("numAppends", RuntimeCounter(1));
+}
+
+bool OptimizedPartitionedOutput::needsInput() const {
+  return blockingReason_ == BlockingReason::kNotBlocked;
+}
+
+// Flushes when needed and reports noMoreData() once all input is delivered.
+RowVectorPtr OptimizedPartitionedOutput::getOutput() {
+  if (finished_) {
+    return nullptr;
+  }
+  blockingReason_ = BlockingReason::kNotBlocked;
+  if (noMoreInput_ || serializer_->bytesBuffered() >= maxOutputBufferBytes_) {
+    flush();
+  }
+
+  // While blocked, do not advance operator state even if noMoreInput_ is set;
+  // the driver calls getOutput() again once the OutputBuffer has space.
+  if (blockingReason_ != BlockingReason::kNotBlocked) {
+    return nullptr;
+  }
+
+  if (noMoreInput_ && serializer_->bytesBuffered() == 0) {
+    // TODO: merge serializer runtime stats into operator stats once
+    // PrestoIterativePartitioningSerializer exposes runtimeStats().
+    auto bufferManager = bufferManager_.lock();
+    VELOX_CHECK_NOT_NULL(
+        bufferManager, "OutputBufferManager was already destructed");
+    bufferManager->noMoreData(operatorCtx_->task()->taskId());
+    finished_ = true;
+  }
+  return nullptr;
+}
+
+BlockingReason OptimizedPartitionedOutput::isBlocked(ContinueFuture* future) {
+  if (blockingReason_ == BlockingReason::kNotBlocked) {
+    return BlockingReason::kNotBlocked; // Nothing pending.
+  }
+  *future = std::move(future_); // Hand over the future recorded by flush().
+  blockingReason_ = BlockingReason::kNotBlocked;
+  return BlockingReason::kWaitForConsumer;
+}
+
+bool OptimizedPartitionedOutput::isFinished() {
+  return finished_;
+}
+
+void OptimizedPartitionedOutput::initializeSerializerLayout() {
+  // No output columns, or no output projection recorded: use outputType_ as
+  // is and leave the column mapping empty.
+  if (outputType_->size() == 0 || outputChannels_.empty()) {
+    serializerInputType_ = outputType_;
+    return;
+  }
+
+  const auto numOutputs = outputChannels_.size();
+  std::vector<std::string> columnNames;
+  std::vector<TypePtr> columnTypes;
+  columnNames.reserve(numOutputs);
+  columnTypes.reserve(numOutputs);
+  serializerInputByOutput_.reserve(numOutputs);
+
+  // Input channel -> position of that channel in the serializer input.
+  std::unordered_map<column_index_t, column_index_t> positionByChannel;
+  positionByChannel.reserve(numOutputs);
+  for (const auto channel : outputChannels_) {
+    // First occurrence takes the next position; duplicates reuse theirs.
+    const auto [entry, isNew] = positionByChannel.try_emplace(
+        channel, static_cast<column_index_t>(serializerInputChannels_.size()));
+    if (isNew) {
+      serializerInputChannels_.push_back(channel);
+      columnNames.push_back(inputType_->nameOf(channel));
+      columnTypes.push_back(inputType_->childAt(channel));
+    }
+    serializerInputByOutput_.push_back(entry->second);
+  }
+
+  serializerInputType_ = ROW(std::move(columnNames), std::move(columnTypes));
+}
+
+// Builds the row vector handed to the serializer for 'input'.
+RowVectorPtr OptimizedPartitionedOutput::prepareSerializerInput(
+    const RowVectorPtr& input) const {
+  VELOX_CHECK_NOT_NULL(input);
+
+  const bool hasColumns = serializerInputType_->size() != 0;
+
+  // No distinct-channel list was built: make sure 'input' is loaded and
+  // pass it through unchanged.
+  if (hasColumns && serializerInputChannels_.empty()) {
+    input->loadedVector();
+    return input;
+  }
+
+  // Gather the loaded children of the distinct channels. Nothing is gathered
+  // for a zero-column type, so the result then carries only the row count.
+  std::vector<VectorPtr> children;
+  if (hasColumns) {
+    children.reserve(serializerInputChannels_.size());
+    for (const auto channel : serializerInputChannels_) {
+      children.push_back(
+          BaseVector::loadedVectorShared(input->childAt(channel)));
+    }
+  }
+
+  return std::make_shared<RowVector>(
+      input->pool(),
+      serializerInputType_,
+      nullptr /*nulls*/,
+      input->size(),
+      std::move(children));
+}
+
+void OptimizedPartitionedOutput::flush() {
+  const auto flushedBytes = serializer_->bytesBuffered();
+  const auto flushedRows = serializer_->rowsBuffered();
+  // The totals above are captured first; they feed the stats at the end.
+  // This will serialize all destinations and reset serializer_->bytesBuffered()
+  // to 0.
+  auto serializedIOBufs = serializer_->flush();
+  auto bufferManager = bufferManager_.lock();
+  VELOX_CHECK_NOT_NULL(
+      bufferManager, "OutputBufferManager was already destructed");
+
+  bool shouldBlock = false;
+  ContinueFuture future = ContinueFuture::makeEmpty();
+  for (auto& [destination, pageData] : serializedIOBufs) {
+    // The future is passed to bufferManager->enqueue() only until the first
+    // destination reports blocked; nullptr is passed afterwards to avoid
+    // creating ContinueFuture objects for the remaining destinations.
+    ContinueFuture* futurePtr = shouldBlock ? nullptr : &future;
+
+    // Enqueue the data for each non-empty partition. Since the pageData is
+    // already serialized, enqueueing it does not cause new memory
+    // allocations. The pageData is always moved to the OutputBuffer, whether
+    // or not the OutputBuffer is blocked.
+    bool blocked = bufferManager->enqueue(
+        taskId_,
+        static_cast<int>(destination),
+        std::make_unique<PrestoSerializedPage>(
+            std::move(pageData.first),
+            [fn = bufferReleaseFn_](folly::IOBuf&) { fn(); },
+            pageData.second),
+        futurePtr);
+    // Only the first blocked destination records the future and the reason.
+    if (blocked && !shouldBlock) {
+      blockingReason_ = BlockingReason::kWaitForConsumer;
+      shouldBlock = true;
+      future_ = std::move(future);
+    }
+  }
+  // "numFlushes" counts only flushes that carried at least one row.
+  auto lockedStats = stats_.wlock();
+  lockedStats->addOutputVector(flushedBytes, flushedRows);
+  if (flushedRows > 0) {
+    ++numFlushes_;
+    lockedStats->addRuntimeStat("numFlushes", RuntimeCounter(1));
+  }
+  if (shouldBlock) {
+    ++numBlockedTimes_;
+    lockedStats->addRuntimeStat("numBlockedTimes", RuntimeCounter(1));
+  }
+}
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedPartitionedOutput.h b/velox/exec/OptimizedPartitionedOutput.h
new file mode 100644
index 00000000000..78ddcaf4a6f
--- /dev/null
+++ b/velox/exec/OptimizedPartitionedOutput.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/exec/Operator.h"
+#include "velox/exec/OutputBufferManager.h"
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+
+namespace facebook::velox::exec {
+
+/// Partitioned output operator backed by PrestoIterativePartitioningSerializer.
+///
+/// Routes each input row to a partition via the plan node's partition function,
+/// buffers the partitioned data, and flushes serialized Presto pages into the
+/// output buffer manager when the buffer is full or the pipeline is draining.
+class OptimizedPartitionedOutput : public Operator {
+ public:
+  /// Minimum flush size for non-final flush; 60 KB + overhead fits a 64 KB
+  /// network MTU.
+  static constexpr uint64_t kMinDestinationSize = 60 * 1024;
+
+  OptimizedPartitionedOutput(
+      int32_t operatorId,
+      DriverCtx* ctx,
+      const std::shared_ptr<const core::PartitionedOutputNode>& planNode);
+
+  void addInput(RowVectorPtr input) override;
+
+  /// Returns true when the operator is not waiting for the output buffer to
+  /// drain. The driver checks this before calling addInput() so a blocked
+  /// state does not accumulate additional rows.
+  bool needsInput() const override;
+
+  /// Always returns nullptr; output is pushed into the buffer manager as a
+  /// side-effect. Flushes the serializer when the buffer is full or the
+  /// pipeline is draining, then signals noMoreData() once all rows are sent.
+  RowVectorPtr getOutput() override;
+
+  BlockingReason isBlocked(ContinueFuture* future) override;
+
+  bool isFinished() override;
+
+ private:
+  /// Computes the serializer input columns and the mapping from output columns
+  /// to serializer input columns.
+  void initializeSerializerLayout();
+
+  /// Builds the RowVector consumed by the serializer. When the output layout
+  /// has duplicated columns, this projects only the distinct columns and
+  /// leaves duplication to flush time.
+  RowVectorPtr prepareSerializerInput(const RowVectorPtr& input) const;
+
+  /// Serializes all buffered rows into Presto pages and enqueues each page
+  /// into the output buffer manager. All destinations are always enqueued;
+  /// sets blockingReason_ and records a future if the output buffer is full.
+  /// Increments numFlushes_ only when at least one row was buffered.
+  void flush();
+
+  const std::string taskId_;
+  const RowTypePtr inputType_;
+  const std::vector<column_index_t> keyChannels_;
+  /// Non-empty when the output layout differs from the input.
+  const std::vector<column_index_t> outputChannels_;
+  const int32_t numDestinations_;
+
+  const bool replicateNullsAndAny_;
+  const std::weak_ptr<exec::OutputBufferManager> bufferManager_;
+  /// Holds a reference to the owning task to prevent it from being destroyed
+  /// while serialized pages are in flight inside the buffer manager.
+  const std::function<void()> bufferReleaseFn_;
+  const int64_t maxOutputBufferBytes_;
+
+  velox::memory::MemoryPool* pool_;
+
+  /// Computes per-row partition assignments. Null when numDestinations_ == 1.
+  std::unique_ptr<core::PartitionFunction> partitionFunction_;
+  /// Reusable buffer for per-row partition assignments.
+  std::vector<uint32_t> partitions_;
+
+  std::unique_ptr<serializer::presto::PrestoIterativePartitioningSerializer>
+      serializer_;
+  /// Row type passed to serializer_->append(). It only includes distinct
+  /// columns from the output layout.
+  RowTypePtr serializerInputType_;
+  /// Input channels that make up the serializer input type. Empty if the output
+  /// layout is the same as the input.
+  std::vector<column_index_t> serializerInputChannels_;
+  /// For each output column index, store the corresponding serializer input
+  /// column.
+  std::vector<column_index_t> serializerInputByOutput_;
+
+  BlockingReason blockingReason_{BlockingReason::kNotBlocked};
+  ContinueFuture future_;
+  bool finished_{false};
+
+  /// Counts addInput() calls; incremented once per call after the append.
+  /// Exposed as the "numAppends" runtime stat.
+  uint64_t numAppends_{0};
+  /// Counts non-empty flush() calls — flushes that serialized at least one
+  /// row. Exposed as the "numFlushes" runtime stat for test verification.
+  uint64_t numFlushes_{0};
+  /// Counts flush() calls that caused the driver to block on a full output
+  /// buffer. Exposed as the "numBlockedTimes" runtime stat.
+  uint64_t numBlockedTimes_{0};
+};
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedVectorHasher.cpp b/velox/exec/OptimizedVectorHasher.cpp
new file mode 100644
index 00000000000..507ffc9edb1
--- /dev/null
+++ b/velox/exec/OptimizedVectorHasher.cpp
@@ -0,0 +1,407 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/exec/OptimizedVectorHasher.h"
+
+#include "velox/common/base/SimdUtil.h"
+#include "velox/type/FloatingPointUtil.h"
+
+namespace facebook::velox::exec {
+namespace {
+
+// Hashes the value of `decoded` at `index`. Performs no null check itself.
+template <bool typeProvidesCustomComparison, TypeKind Kind>
+uint64_t hashOne(const DecodedVector& decoded, vector_size_t index) {
+  constexpr bool kIsComplex = Kind == TypeKind::ROW ||
+      Kind == TypeKind::ARRAY || Kind == TypeKind::MAP;
+  if constexpr (kIsComplex) {
+    return decoded.base()->hashValueAt(decoded.index(index));
+  } else {
+    using T = typename KindToFlatVector<Kind>::HashRowType;
+    const T value = decoded.valueAt<T>(index);
+    if constexpr (typeProvidesCustomComparison) {
+      return static_cast<const CanProvideCustomComparisonType<Kind>*>(
+                 decoded.base()->type().get())
+          ->hash(value);
+    } else if constexpr (std::is_floating_point_v<T>) {
+      return util::floating_point::NaNAwareHash<T>()(value);
+    } else {
+      return folly::hasher<T>()(value);
+    }
+  }
+}
+
+constexpr uint64_t kNullHash = OptimizedVectorHasher::kNullHash;
+
+// Applies the single value `hash` to every slot of `result[0..size)`: the slot
+// is overwritten, or, when `Mix` is true, combined with its current value.
+template <bool Mix>
+inline void broadcastHash(vector_size_t size, uint64_t* result, uint64_t hash) {
+  if constexpr (!Mix) {
+    std::fill(result, result + size, hash);
+  } else {
+    for (vector_size_t row = 0; row < size; ++row) {
+      result[row] = bits::hashMix(result[row], hash);
+    }
+  }
+}
+
+// Stores `computeHash(row)` for each of the `size` rows, or mixes it into the
+// existing value when `Mix` is true. Caller guarantees no row is null.
+template <bool Mix, typename ComputeHash>
+inline void
+hashLoopNoNulls(vector_size_t size, uint64_t* result, ComputeHash computeHash) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    const auto hash = computeHash(row);
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+// Produces one hash per row: `kNullHash` for rows that are null in `decoded`
+// (`computeHash` is not called for those) and `computeHash(row)` otherwise.
+// The hash is stored in `result`, or mixed into it when `Mix` is true.
+template <bool Mix, typename ComputeHash>
+inline void hashLoopWithNulls(
+    vector_size_t size,
+    uint64_t* result,
+    const DecodedVector& decoded,
+    ComputeHash computeHash) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    const uint64_t hash = decoded.isNullAt(row) ? kNullHash : computeHash(row);
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+// Row `i` takes the precomputed hash `baseHashes[indices[i]]`, which replaces
+// `result[i]` or, when `Mix` is true, is mixed into it.
+template <bool Mix>
+inline void scatterDictionaryHashes(
+    vector_size_t size,
+    uint64_t* result,
+    const vector_size_t* indices,
+    const uint64_t* baseHashes) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], baseHashes[indices[row]]);
+    } else {
+      result[row] = baseHashes[indices[row]];
+    }
+  }
+}
+
+// Per row: `kNullHash` when the row's bit in `nulls` marks it null (its index
+// is then not read), else `baseHashes[indices[row]]`. The hash replaces
+// `result[row]` or, when `Mix` is true, is mixed into it.
+template <bool Mix>
+inline void scatterDictionaryHashesWithExtraNulls(
+    vector_size_t size,
+    uint64_t* result,
+    const vector_size_t* indices,
+    const uint64_t* nulls,
+    const uint64_t* baseHashes) {
+  for (vector_size_t row = 0; row < size; ++row) {
+    const uint64_t hash =
+        bits::isBitNull(nulls, row) ? kNullHash : baseHashes[indices[row]];
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+/// Converts Velox's packed boolean storage into one hash per row.
+/// @param values Bitmap with one bit per row; a set bit means true.
+/// @param nulls Optional bitmap; rows whose bit is null hash to kNullHash.
+template <bool Mix>
+inline void scatterBoolHashes(
+    vector_size_t size,
+    uint64_t* result,
+    const uint64_t* values,
+    const uint64_t* nulls) {
+  using Batch = xsimd::batch<int64_t>;
+  static constexpr vector_size_t kSimdBatchSize = Batch::size;
+  const auto falseHash = folly::hasher<bool>()(false);
+  const auto trueHash = folly::hasher<bool>()(true);
+  // 'row' carries over from the SIMD loop to the scalar loop further down.
+  vector_size_t row{0};
+  if constexpr (!Mix) {
+    const auto falseHashBatch =
+        xsimd::broadcast<int64_t>(static_cast<int64_t>(falseHash));
+    const auto trueHashBatch =
+        xsimd::broadcast<int64_t>(static_cast<int64_t>(trueHash));
+    const auto nullHashBatch =
+        xsimd::broadcast<int64_t>(static_cast<int64_t>(kNullHash));
+    auto* const signedResult = reinterpret_cast<int64_t*>(result);
+    // Relies on kSimdBatchSize dividing 64: a batch never spans two words.
+    for (; row + kSimdBatchSize <= size; row += kSimdBatchSize) {
+      const auto bitOffset = row & 63;
+      const auto valueBits = (values[row / 64] >> bitOffset) &
+          bits::lowMask(static_cast<int32_t>(kSimdBatchSize));
+      auto hashes = xsimd::select(
+          simd::fromBitMask<int64_t>(valueBits), trueHashBatch, falseHashBatch);
+
+      if (nulls != nullptr) {
+        const auto notNullBits = (nulls[row / 64] >> bitOffset) &
+            bits::lowMask(static_cast<int32_t>(kSimdBatchSize));
+        hashes = xsimd::select(
+            simd::fromBitMask<int64_t>(notNullBits), hashes, nullHashBatch);
+      }
+
+      hashes.store_unaligned(signedResult + row);
+    }
+  }
+  // Scalar path: the remaining tail rows, and every row when Mix is true.
+  // TODO: improve performance
+  for (; row < size; ++row) {
+    const auto hash = nulls != nullptr && bits::isBitNull(nulls, row)
+        ? kNullHash
+        : (bits::isBitSet(values, row) ? trueHash : falseHash);
+    if constexpr (Mix) {
+      result[row] = bits::hashMix(result[row], hash);
+    } else {
+      result[row] = hash;
+    }
+  }
+}
+
+// Calls `body` with std::true_type or std::false_type according to `mix`.
+template <typename Body>
+inline void dispatchMix(bool mix, Body body) {
+  if (!mix) {
+    body(std::false_type{});
+  } else {
+    body(std::true_type{});
+  }
+}
+
+template <typename ComputeHash>
+inline void hashDecoded(
+    bool mix,
+    vector_size_t size,
+    uint64_t* result,
+    const DecodedVector& decoded,
+    ComputeHash computeHash) {
+  dispatchMix(mix, [&](auto mixTag) {
+    constexpr bool kMix = decltype(mixTag)::value;
+    if (decoded.mayHaveNulls()) {
+      hashLoopWithNulls<kMix>(size, result, decoded, computeHash);
+    } else {
+      hashLoopNoNulls<kMix>(size, result, computeHash);
+    }
+  });
+}
+
+} // namespace
+
+OptimizedVectorHasher::OptimizedVectorHasher(
+    TypePtr type,
+    column_index_t channel)
+    : channel_(channel),
+      type_(std::move(type)),
+      typeKind_(type_->kind()),
+      typeProvidesCustomComparison_(type_->providesCustomComparison()) {}
+
+void OptimizedVectorHasher::decode(
+    const BaseVector& vector,
+    const SelectivityVector& rows) {
+  VELOX_CHECK(
+      type_->kindEquals(vector.type()),
+      "Type mismatch: {} vs. {}",
+      type_->toString(),
+      vector.type()->toString());
+  decoded_.decode(vector, rows);
+}
+
+void OptimizedVectorHasher::hash(bool mix, raw_vector<uint64_t>& result) {
+  if (typeKind_ == TypeKind::UNKNOWN) {
+    dispatchMix(mix, [&](auto mixTag) {
+      broadcastHash<decltype(mixTag)::value>(
+          decoded_.size(), result.data(), kNullHash);
+    });
+  } else {
+    VELOX_DYNAMIC_TYPE_DISPATCH(hashValues, typeKind_, mix, result.data());
+  }
+}
+
+void OptimizedVectorHasher::hash(
+    const SelectivityVector& rows,
+    bool mix,
+    raw_vector<uint64_t>& result) {
+  if (decoded_.size() == 0 || result.empty() || rows.isAllSelected()) {
+    hash(mix, result);
+    return;
+  }
+  // Keep a copy of the incoming values so unselected rows can be restored.
+  const auto original = result;
+
+  hash(mix, result);
+  // NOTE(review): assumes 'rows' covers result.size() rows - confirm.
+  // The specialized hash() path computes values for the full decoded extent.
+  // Restore rows that were not selected to match VectorHasher semantics.
+  for (vector_size_t row = 0; row < result.size(); ++row) {
+    if (!rows.isValid(row)) {
+      result[row] = original[row];
+    }
+  }
+}
+
+// Chooses between the raw-buffer path (hashFlatValues) and the decoded path
+// (hashTyped) for a column of kind `Kind`.
+template <TypeKind Kind>
+void OptimizedVectorHasher::hashValues(bool mix, uint64_t* result) {
+  using T = typename TypeTraits<Kind>::NativeType;
+  constexpr bool kIsComplex = Kind == TypeKind::ROW ||
+      Kind == TypeKind::ARRAY || Kind == TypeKind::MAP;
+
+  // Only identity-mapped, non-constant columns of a non-complex kind without
+  // custom comparison take the raw-buffer path.
+  const bool useFlatPath = !kIsComplex && decoded_.isIdentityMapping() &&
+      !decoded_.isConstantMapping() && !typeProvidesCustomComparison_;
+  if (useFlatPath) {
+    hashFlatValues<T>(mix, result);
+    return;
+  }
+
+  // Everything else goes through the decoded path, with the custom-comparison
+  // flag lifted to a template argument.
+  if (typeProvidesCustomComparison_) {
+    hashTyped<true, Kind>(mix, result);
+  } else {
+    hashTyped<false, Kind>(mix, result);
+  }
+}
+
+template <bool typeProvidesCustomComparison, TypeKind Kind>
+void OptimizedVectorHasher::hashTyped(bool mix, uint64_t* result) {
+  const auto size = decoded_.size();
+
+  // Constant column: compute the hash once and broadcast it to every row.
+  if (decoded_.isConstantMapping()) {
+    const uint64_t hash = decoded_.isNullAt(0)
+        ? kNullHash
+        : hashOne<typeProvidesCustomComparison, Kind>(decoded_, 0);
+    dispatchMix(mix, [&](auto mixTag) {
+      broadcastHash<decltype(mixTag)::value>(size, result, hash);
+    });
+    return;
+  }
+
+  // Non-identity mapping covering more rows than its base: hash every base
+  // entry once (never mixed), then look the hashes up through the indices.
+  if (!decoded_.isIdentityMapping() && size > decoded_.base()->size()) {
+    const DecodedVector baseDecoded(*decoded_.base());
+    const auto baseSize = decoded_.base()->size();
+    dictionaryHashes_.resize(baseSize);
+    const auto computeBaseHash = [&](vector_size_t i) {
+      return hashOne<typeProvidesCustomComparison, Kind>(baseDecoded, i);
+    };
+    hashDecoded(
+        /*mix=*/false,
+        baseSize,
+        dictionaryHashes_.data(),
+        baseDecoded,
+        computeBaseHash);
+    // Base nulls already hash to kNullHash; extra nulls are handled below.
+    const auto* const indices = decoded_.indices();
+    dispatchMix(mix, [&](auto mixTag) {
+      constexpr bool kMix = decltype(mixTag)::value;
+      if (decoded_.hasExtraNulls()) {
+        scatterDictionaryHashesWithExtraNulls<kMix>(
+            size, result, indices, decoded_.nulls(), dictionaryHashes_.data());
+      } else {
+        scatterDictionaryHashes<kMix>(
+            size, result, indices, dictionaryHashes_.data());
+      }
+    });
+    return;
+  }
+
+  // Generic fallback: hash row by row through the decoded vector.
+  const auto computeHash = [&](vector_size_t i) {
+    return hashOne<typeProvidesCustomComparison, Kind>(decoded_, i);
+  };
+  hashDecoded(mix, size, result, decoded_, computeHash);
+}
+
+template <typename T>
+void OptimizedVectorHasher::hashFlatValues(bool mix, uint64_t* result) {
+  if constexpr (!std::is_void_v<T>) {
+    const T* const rawValues = decoded_.data<T>();
+    const auto hashAt = [rawValues](vector_size_t row) {
+      if constexpr (std::is_floating_point_v<T>) {
+        return util::floating_point::NaNAwareHash<T>()(rawValues[row]);
+      } else {
+        return folly::hasher<T>()(rawValues[row]);
+      }
+    };
+    hashDecoded(mix, decoded_.size(), result, decoded_, hashAt);
+  } else {
+    // A void T has no raw values to read; not expected to be reached.
+    VELOX_NYI();
+  }
+}
+
+template <>
+void OptimizedVectorHasher::hashFlatValues<bool>(bool mix, uint64_t* result) {
+  const auto* const values = decoded_.data<uint64_t>();
+  const auto* const nulls =
+      decoded_.mayHaveNulls() ? decoded_.nulls() : nullptr;
+  dispatchMix(mix, [&](auto mixTag) {
+    scatterBoolHashes<decltype(mixTag)::value>(
+        decoded_.size(), result, values, nulls);
+  });
+}
+
+void OptimizedVectorHasher::hashPrecomputed(
+    bool mix,
+    raw_vector<uint64_t>& result) const {
+  dispatchMix(mix, [&](auto mixTag) {
+    broadcastHash<decltype(mixTag)::value>(
+        result.size(), result.data(), precomputedHash_);
+  });
+}
+
+void OptimizedVectorHasher::precompute(const BaseVector& value) {
+  if (value.isNullAt(0)) {
+    precomputedHash_ = kNullHash;
+    return;
+  }
+  // NOTE: this overwrites decoded_, the member that decode() also fills.
+  decoded_.decode(value);
+  if (typeKind_ == TypeKind::UNKNOWN) {
+    precomputedHash_ = kNullHash;
+    return;
+  }
+  // Lift the custom-comparison flag to a template argument of hashOne.
+  if (typeProvidesCustomComparison_) {
+    precomputedHash_ = VELOX_DYNAMIC_TEMPLATE_TYPE_DISPATCH(
+        hashOne, true, typeKind_, decoded_, 0);
+  } else {
+    precomputedHash_ = VELOX_DYNAMIC_TEMPLATE_TYPE_DISPATCH(
+        hashOne, false, typeKind_, decoded_, 0);
+  }
+}
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/OptimizedVectorHasher.h b/velox/exec/OptimizedVectorHasher.h
new file mode 100644
index 00000000000..830b453abe8
--- /dev/null
+++ b/velox/exec/OptimizedVectorHasher.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/common/memory/RawVector.h"
+#include "velox/exec/Operator.h"
+#include "velox/vector/VectorTypeUtils.h"
+
+namespace facebook::velox::exec {
+
+class OptimizedVectorHasher {
+ public:
+  OptimizedVectorHasher(TypePtr type, column_index_t channel);
+
+  static std::unique_ptr<OptimizedVectorHasher> create(
+      TypePtr type,
+      column_index_t channel) {
+    return std::make_unique<OptimizedVectorHasher>(std::move(type), channel);
+  }
+
+  column_index_t channel() const {
+    return channel_;
+  }
+
+  // Decodes 'vector' for the given 'rows' in preparation for calling hash().
+  // Fails if the kind of the vector's type differs from the kind of the type
+  // this hasher was created with.
+  void decode(const BaseVector& vector, const SelectivityVector& rows);
+  // Hashes every decoded row; 'mix' combines with the values in 'result'.
+  void hash(bool mix, raw_vector<uint64_t>& result);
+  // Same, but keeps the incoming 'result' values of rows not in 'rows'.
+  void
+  hash(const SelectivityVector& rows, bool mix, raw_vector<uint64_t>& result);
+  // Writes (or mixes) the hash saved by precompute() into all of 'result'.
+  void hashPrecomputed(bool mix, raw_vector<uint64_t>& result) const;
+  // Saves the hash of row 0 of 'value', or kNullHash if that row is null.
+  void precompute(const BaseVector& value);
+
+  static constexpr uint64_t kNullHash = BaseVector::kNullHash;
+
+  template <TypeKind Kind>
+  void hashValues(bool mix, uint64_t* result);
+
+ private:
+  template <bool typeProvidesCustomComparison, TypeKind Kind>
+  void hashTyped(bool mix, uint64_t* result);
+
+  template <typename T>
+  void hashFlatValues(bool mix, uint64_t* result);
+
+  const column_index_t channel_;
+  const TypePtr type_;
+  const TypeKind typeKind_;
+  const bool typeProvidesCustomComparison_;
+
+  DecodedVector decoded_;
+  raw_vector<uint64_t> dictionaryHashes_;
+  uint64_t precomputedHash_{0};
+};
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/PartitionedOutput.cpp b/velox/exec/PartitionedOutput.cpp
index ba4e23d738b..74320389489 100644
--- a/velox/exec/PartitionedOutput.cpp
+++ b/velox/exec/PartitionedOutput.cpp
@@ -207,10 +207,13 @@ PartitionedOutput::PartitionedOutput(
       numDestinations_(planNode->numPartitions()),
       replicateNullsAndAny_(planNode->isReplicateNullsAndAny()),
       partitionFunction_(
-          numDestinations_ == 1 ? nullptr
-                                : planNode->partitionFunctionSpec().create(
-                                      numDestinations_,
-                                      /*localExchange=*/false)),
+          numDestinations_ == 1
+              ? nullptr
+              : planNode->partitionFunctionSpec().create(
+                    numDestinations_,
+                    /*localExchange=*/false,
+                    ctx->queryConfig()
+                        .optimizedHashPartitionFunctionEnabled())),
       outputChannels_(calculateOutputChannels(
           planNode->inputType(),
           planNode->outputType(),
diff --git a/velox/exec/RoundRobinPartitionFunction.h b/velox/exec/RoundRobinPartitionFunction.h
index b84c6d2ffaf..a13ed529f55 100644
--- a/velox/exec/RoundRobinPartitionFunction.h
+++ b/velox/exec/RoundRobinPartitionFunction.h
@@ -43,7 +43,8 @@ class RoundRobinPartitionFunctionSpec : public core::PartitionFunctionSpec {
  public:
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool /*localExchange*/) const override {
+      bool /*localExchange*/,
+      bool /*useOptimizedPartitionFunction*/ = false) const override {
     return std::make_unique<velox::exec::RoundRobinPartitionFunction>(
         numPartitions);
   }
diff --git a/velox/exec/RowNumber.cpp b/velox/exec/RowNumber.cpp
index cd2cd4ce36a..04427975120 100644
--- a/velox/exec/RowNumber.cpp
+++ b/velox/exec/RowNumber.cpp
@@ -449,8 +449,14 @@ void RowNumber::setupInputSpiller(
     keyChannels.push_back(hasher->channel());
   }
 
-  spillHashFunction_ = std::make_unique<HashPartitionFunction>(
-      inputSpiller_->hashBits(), inputType_, keyChannels);
+  spillHashFunction_ = createHashPartitionFunction(
+      inputSpiller_->hashBits(),
+      inputType_,
+      keyChannels,
+      {},
+      operatorCtx_->driverCtx()
+          ->queryConfig()
+          .optimizedHashPartitionFunctionEnabled());
 }
 
 void RowNumber::spill() {
diff --git a/velox/exec/RowNumber.h b/velox/exec/RowNumber.h
index b34fc9d9c20..8e53713fc77 100644
--- a/velox/exec/RowNumber.h
+++ b/velox/exec/RowNumber.h
@@ -142,7 +142,7 @@ class RowNumber : public Operator {
   SpillPartitionSet spillInputPartitionSet_;
 
   // Used to calculate the spill partition numbers of the inputs.
-  std::unique_ptr<HashPartitionFunction> spillHashFunction_;
+  std::unique_ptr<HashPartitionFunctionBase> spillHashFunction_;
 
   // The cpu may be voluntarily yield after running too long when processing
   // input from spilled file.
diff --git a/velox/exec/ScaleWriterLocalPartition.cpp b/velox/exec/ScaleWriterLocalPartition.cpp
index 7530ff403a0..1764adabf6a 100644
--- a/velox/exec/ScaleWriterLocalPartition.cpp
+++ b/velox/exec/ScaleWriterLocalPartition.cpp
@@ -57,7 +57,10 @@ ScaleWriterPartitioningLocalPartition::ScaleWriterPartitioningLocalPartition(
       ? nullptr
       : planNode->partitionFunctionSpec().create(
             numTablePartitions_,
-            /*localExchange=*/true);
+            /*localExchange=*/true,
+            operatorCtx_->driverCtx()
+                ->queryConfig()
+                .optimizedHashPartitionFunctionEnabled());
 }
 
 void ScaleWriterPartitioningLocalPartition::initialize() {
diff --git a/velox/exec/SubPartitionedSortWindowBuild.cpp b/velox/exec/SubPartitionedSortWindowBuild.cpp
index 2f2a247a8d4..db437748fbb 100644
--- a/velox/exec/SubPartitionedSortWindowBuild.cpp
+++ b/velox/exec/SubPartitionedSortWindowBuild.cpp
@@ -22,6 +22,7 @@ namespace facebook::velox::exec {
 SubPartitionedSortWindowBuild::SubPartitionedSortWindowBuild(
     const std::shared_ptr<const core::WindowNode>& node,
     int32_t numSubPartitions,
+    const core::QueryConfig& queryConfig,
     velox::memory::MemoryPool* pool,
     common::PrefixSortConfig&& prefixSortConfig,
     const common::SpillConfig* spillConfig,
@@ -40,8 +41,13 @@ SubPartitionedSortWindowBuild::SubPartitionedSortWindowBuild(
   for (int i = 0; i < numPartitionKeys_; i++) {
     keyChannels[i] = inputChannels_[i];
   }
-  subPartitioningFunction_ = std::make_unique<HashPartitionFunction>(
-      false, numSubPartitions_, node->inputType(), keyChannels);
+  subPartitioningFunction_ = createHashPartitionFunction(
+      /*localExchange=*/false,
+      numSubPartitions_,
+      node->inputType(),
+      keyChannels,
+      {},
+      queryConfig.optimizedHashPartitionFunctionEnabled());
   subWindowBuilds_.resize(numSubPartitions_);
   for (int i = 0; i < numSubPartitions_; i++) {
     subWindowBuilds_[i] = std::make_unique<SortWindowBuild>(
@@ -59,7 +65,12 @@ void SubPartitionedSortWindowBuild::addInput(RowVectorPtr input) {
   VELOX_CHECK_LT(currentSubPartition_, 0);
 
   subPartitionIdsBuffer_.resize(input->size());
-  subPartitioningFunction_->partition(*input, subPartitionIdsBuffer_);
+  std::optional<uint32_t> singlePartition =
+      subPartitioningFunction_->partition(*input, subPartitionIdsBuffer_);
+  if (singlePartition.has_value()) {
+    simd::simdFill<uint32_t>(
+        subPartitionIdsBuffer_.data(), singlePartition.value(), input->size());
+  }
 
   for (auto i = 0; i < inputChannels_.size(); ++i) {
     decodedInputVectors_[i].decode(*input->childAt(inputChannels_[i]));
diff --git a/velox/exec/SubPartitionedSortWindowBuild.h b/velox/exec/SubPartitionedSortWindowBuild.h
index 8735f438d30..f0da95bdf95 100644
--- a/velox/exec/SubPartitionedSortWindowBuild.h
+++ b/velox/exec/SubPartitionedSortWindowBuild.h
@@ -33,6 +33,7 @@ class SubPartitionedSortWindowBuild : public WindowBuild {
   SubPartitionedSortWindowBuild(
       const std::shared_ptr<const core::WindowNode>& node,
       int32_t numSubPartitions,
+      const core::QueryConfig& queryConfig,
       velox::memory::MemoryPool* pool,
       common::PrefixSortConfig&& prefixSortConfig,
       const common::SpillConfig* spillConfig,
@@ -80,7 +81,7 @@ class SubPartitionedSortWindowBuild : public WindowBuild {
   exec::SpillStats* const spillStats_;
 
   // Divide input rows to the corresponding sub partitions.
-  std::unique_ptr<HashPartitionFunction> subPartitioningFunction_;
+  std::unique_ptr<HashPartitionFunctionBase> subPartitioningFunction_;
 
   // WindowBuilds for each sub partition.
   std::vector<std::unique_ptr<SortWindowBuild>> subWindowBuilds_;
diff --git a/velox/exec/Window.cpp b/velox/exec/Window.cpp
index f9107522f0a..b763371a801 100644
--- a/velox/exec/Window.cpp
+++ b/velox/exec/Window.cpp
@@ -75,6 +75,7 @@ Window::Window(
       windowBuild_ = std::make_unique<SubPartitionedSortWindowBuild>(
           windowNode,
           numSubPartitions,
+          driverCtx->queryConfig(),
           pool(),
           makePrefixSortConfig(driverCtx->queryConfig()),
           spillConfig,
diff --git a/velox/exec/benchmarks/CMakeLists.txt b/velox/exec/benchmarks/CMakeLists.txt
index 7a721bf91a6..3ccff61baae 100644
--- a/velox/exec/benchmarks/CMakeLists.txt
+++ b/velox/exec/benchmarks/CMakeLists.txt
@@ -20,6 +20,27 @@ target_link_libraries(
   Folly::follybenchmark
 )
 
+add_executable(velox_exec_optimized_vector_hasher_benchmark OptimizedVectorHasherBenchmark.cpp)
+
+target_link_libraries(
+  velox_exec_optimized_vector_hasher_benchmark
+  velox_exec
+  velox_vector_test_lib
+  Folly::follybenchmark
+)
+
+add_executable(
+  velox_exec_optimized_hash_partition_function_benchmark
+  OptimizedHashPartitionFunctionBenchmark.cpp
+)
+
+target_link_libraries(
+  velox_exec_optimized_hash_partition_function_benchmark
+  velox_exec
+  velox_vector_test_lib
+  Folly::follybenchmark
+)
+
 add_executable(velox_filter_project_benchmark FilterProjectBenchmark.cpp)
 
 target_link_libraries(
@@ -40,6 +61,16 @@ target_link_libraries(
   Folly::follybenchmark
 )
 
+add_executable(velox_local_exchange_benchmark LocalExchangeBenchmark.cpp)
+
+target_link_libraries(
+  velox_local_exchange_benchmark
+  velox_exec
+  velox_exec_test_lib
+  velox_vector_test_lib
+  Folly::follybenchmark
+)
+
 add_executable(velox_merge_benchmark MergeBenchmark.cpp)
 
 target_link_libraries(
diff --git a/velox/exec/benchmarks/ExchangeBenchmark.cpp b/velox/exec/benchmarks/ExchangeBenchmark.cpp
index 45689ccbf64..d204f4ed666 100644
--- a/velox/exec/benchmarks/ExchangeBenchmark.cpp
+++ b/velox/exec/benchmarks/ExchangeBenchmark.cpp
@@ -17,7 +17,6 @@
 #include <folly/init/Init.h>
 
 #include "velox/core/QueryConfig.h"
-#include "velox/dwio/common/tests/utils/BatchMaker.h"
 #include "velox/exec/Exchange.h"
 #include "velox/exec/PlanNodeStats.h"
 #include "velox/exec/tests/utils/AssertQueryBuilder.h"
@@ -32,15 +31,13 @@
 DEFINE_int32(width, 16, "Number of parties in shuffle");
 DEFINE_int32(task_width, 4, "Number of threads in each task in shuffle");
 
-DEFINE_int32(num_local_tasks, 8, "Number of concurrent local shuffles");
-DEFINE_int32(num_local_repeat, 8, "Number of repeats of local exchange query");
-DEFINE_int32(flat_batch_mb, 1, "MB in a 10k row flat batch.");
-DEFINE_int64(
-    local_exchange_buffer_mb,
-    32,
-    "task-wide buffer in local exchange");
 DEFINE_int64(exchange_buffer_mb, 32, "task-wide buffer in remote exchange");
-DEFINE_int32(dict_pct, 0, "Percentage of columns wrapped in dictionary");
+DEFINE_int32(
+    dict_pct,
+    0,
+    "Percentage of vectors per column wrapped in dictionary encoding. "
+    "Applied independently to each column across all generated row vectors "
+    "and recursively to nested children.");
 // Add the following definitions to allow Clion runs
 DEFINE_bool(gtest_color, false, "");
 DEFINE_string(gtest_filter, "*", "");
@@ -59,70 +56,401 @@ using namespace facebook::velox::test;
 
 namespace {
 
-struct LocalPartitionWaitStats {
-  int64_t totalProducerWaitMs = 0;
-  int64_t totalConsumerWaitMs = 0;
-  std::vector<RuntimeMetric> consumerWaitMs;
-  std::vector<RuntimeMetric> producerWaitMs;
-  std::vector<int64_t> wallMs;
+// True if vector `vectorIndex` of `numVectors` is in the leading `dictPct`%.
+bool shouldWrapVector(
+    int32_t vectorIndex, int32_t numVectors, int32_t dictPct) {
+  VELOX_CHECK_GT(numVectors, 0); // Guards the division below.
+  VELOX_CHECK_GE(dictPct, 0);
+  VELOX_CHECK_LE(dictPct, 100);
+  return dictPct > 0 && (int64_t{vectorIndex} * 100) / numVectors < dictPct;
+}
+
+// Wraps `vector` in place in a dictionary whose indices map every row to
+// itself. For ROW, ARRAY and MAP encodings the children (row fields, array
+// elements, map keys and values) are wrapped first, so each nesting level ends
+// up with its own dictionary layer. Any other encoding is wrapped as a whole.
+// A null `vector` is left untouched.
+void wrapDictionaryRecursive(VectorPtr& vector) {
+  if (!vector) {
+    return;
+  }
+
+  // Wrap the children before the container itself.
+  const auto encoding = vector->encoding();
+  if (encoding == VectorEncoding::Simple::ROW) {
+    auto* rowVector = vector->as<RowVector>();
+    for (auto child = 0; child < rowVector->childrenSize(); ++child) {
+      wrapDictionaryRecursive(rowVector->childAt(child));
+    }
+  } else if (encoding == VectorEncoding::Simple::ARRAY) {
+    auto* arrayVector = vector->as<ArrayVector>();
+    auto wrappedElements = arrayVector->elements();
+    wrapDictionaryRecursive(wrappedElements);
+    arrayVector->setElements(std::move(wrappedElements));
+  } else if (encoding == VectorEncoding::Simple::MAP) {
+    auto* mapVector = vector->as<MapVector>();
+    auto wrappedKeys = mapVector->mapKeys();
+    auto wrappedValues = mapVector->mapValues();
+    wrapDictionaryRecursive(wrappedKeys);
+    wrapDictionaryRecursive(wrappedValues);
+    mapVector->setKeysAndValues(
+        std::move(wrappedKeys), std::move(wrappedValues));
+  }
+
+  // Identity mapping: dictionary row i refers to base row i.
+  const auto numRows = vector->size();
+  auto identity = facebook::velox::test::makeIndices(
+      numRows, [](auto row) { return row; }, vector->pool());
+  vector = BaseVector::wrapInDictionary(nullptr, identity, numRows, vector);
+}
+
+struct ExchangeRunStats {
+  int64_t wallUs = 0; // Wall time of one run, in microseconds.
+  PlanNodeStats partitionedOutputStats; // Summed over leaf and final-agg tasks.
+  PlanNodeStats exchangeStats; // Summed over final-agg tasks.
+};
+
+enum class ExchangeMode {
+  kNormal, // Runs with kOptimizedPartitionedOutputEnabled set to "false".
+  kOptimized, // Runs with kOptimizedPartitionedOutputEnabled set to "true".
+};
+
+/// Column element type dimension for simple-schema exchange benchmarks.
+enum class SimpleColType {
+  kBoolean,
+  kTinyint,
+  kInteger,
+  kBigint,
+  kHugeint, // Maps to HUGEINT() in simpleColTypeToType().
+  kLongDecimal, // Maps to DECIMAL(20, 3) in simpleColTypeToType().
+  kDouble,
+};
+
+TypePtr simpleColTypeToType(SimpleColType colType) {
+  switch (colType) {
+    case SimpleColType::kBoolean:
+      return BOOLEAN();
+    case SimpleColType::kTinyint:
+      return TINYINT();
+    case SimpleColType::kInteger:
+      return INTEGER();
+    case SimpleColType::kBigint:
+      return BIGINT();
+    case SimpleColType::kHugeint:
+      return HUGEINT();
+    case SimpleColType::kLongDecimal:
+      return DECIMAL(20, 3); // Precision 20 > 18, i.e. a long decimal.
+    case SimpleColType::kDouble:
+      return DOUBLE();
+  }
+  VELOX_UNREACHABLE(); // Every enumerator returns inside the switch.
+}
+
+std::string simpleColTypeName(SimpleColType colType) {
+  switch (colType) { // The names become part of the dataset name.
+    case SimpleColType::kBoolean:
+      return "Boolean";
+    case SimpleColType::kTinyint:
+      return "Tinyint";
+    case SimpleColType::kInteger:
+      return "Integer";
+    case SimpleColType::kBigint:
+      return "Bigint";
+    case SimpleColType::kHugeint:
+      return "Hugeint";
+    case SimpleColType::kLongDecimal:
+      return "LongDecimal";
+    case SimpleColType::kDouble:
+      return "Double";
+  }
+  VELOX_UNREACHABLE(); // Every enumerator returns inside the switch.
+}
+
+enum class ExchangeInputKind {
+  kDeep10K, // makeDeepType() schema, 10 vectors of 10000 rows.
+  kDeep50, // makeDeepType() schema, 2000 vectors of 50 rows.
+  kStruct1K, // makeStructType() schema, 100 vectors of 1000 rows.
+};
+
+struct ExchangeInputSpec {
+  std::string name; // Dataset label, e.g. "Deep10K".
+  RowTypePtr type; // Row schema of the dataset.
+  int32_t numVectors; // Number of vectors in the dataset.
+  int32_t rowsPerVector; // Number of rows in each vector.
+};
+
+struct ExchangeBenchmarkResult {
+  std::string datasetName; // Key used to pair runs in printAllExchangeStats().
+  ExchangeMode mode; // Selects the normal or optimized slot of the pair.
+  ExchangeRunStats stats;
 };
 
-void sortByMax(std::vector<RuntimeMetric>& metrics) {
-  std::sort(
-      metrics.begin(),
-      metrics.end(),
-      [](const RuntimeMetric& left, const RuntimeMetric& right) {
-        return left.max > right.max;
-      });
+std::vector<ExchangeBenchmarkResult> benchmarkResults;
+
+// Returns the label of `mode`: "normal" or "optimized".
+std::string modeName(ExchangeMode mode) {
+  switch (mode) {
+    case ExchangeMode::kOptimized:
+      return "optimized";
+    case ExchangeMode::kNormal:
+      return "normal";
+  }
+  VELOX_UNREACHABLE();
+}
+
+/// Builds a ROW type with `numCols` columns named "c0", "c1", ... that all
+/// have the type `colType`.
+RowTypePtr makeSimpleType(const TypePtr& colType, int32_t numCols) {
+  std::vector<std::string> columnNames;
+  columnNames.reserve(numCols);
+  for (int32_t col = 0; col < numCols; ++col) {
+    columnNames.push_back(fmt::format("c{}", col));
+  }
+  // Every column shares the same type, so fill the vector in one step.
+  std::vector<TypePtr> columnTypes(numCols, colType);
+  return ROW(std::move(columnNames), std::move(columnTypes));
+}
+
+// Builds the nested ROW type: c0, r1{k2, r2{i1, i2, r3{s3, i5, d5, b5, a5}}}.
+RowTypePtr makeStructType() {
+  // Assembled innermost-first so each nesting level reads on its own.
+  auto r3 = ROW(
+      {{"s3", VARCHAR()},
+       {"i5", INTEGER()},
+       {"d5", DOUBLE()},
+       {"b5", BOOLEAN()},
+       {"a5", ARRAY(TINYINT())}});
+  auto r2 = ROW(
+      {{"i1", BIGINT()},
+       {"i2", BIGINT()},
+       {"r3", std::move(r3)}});
+  auto r1 = ROW(
+      {{"k2", BIGINT()},
+       {"r2", std::move(r2)}});
+  return ROW({{"c0", BIGINT()}, {"r1", std::move(r1)}});
+}
+
+// Builds a ROW type mixing nested arrays, a struct and a map of maps.
+RowTypePtr makeDeepType() {
+  auto mapValue = ROW({{"s2_int", INTEGER()}, {"s2_string", VARCHAR()}});
+  auto structVal = ROW({{"s_int", INTEGER()}, {"s_array", ARRAY(REAL())}});
+  return ROW(
+      {{"c0", BIGINT()},
+       {"long_array_val", ARRAY(ARRAY(BIGINT()))},
+       {"array_val", ARRAY(VARCHAR())},
+       {"struct_val", std::move(structVal)},
+       {"map_val", MAP(VARCHAR(), MAP(BIGINT(), std::move(mapValue)))}});
 }
 
-void sortByAndPrintMax(
-    const char* title,
-    int64_t total,
-    std::vector<RuntimeMetric>& metrics) {
-  sortByMax(metrics);
-  VELOX_CHECK(!metrics.empty());
-  std::cout << title << "\n Total " << succinctNanos(total)
-            << "\n Max: " << metrics.front().toString()
-            << "\n Median: " << metrics[metrics.size() / 2].toString()
-            << "\n Min: " << metrics.back().toString() << std::endl;
+// Returns the dataset name, schema, vector count and rows per vector of `kind`.
+ExchangeInputSpec makeInputSpec(ExchangeInputKind kind) {
+  switch (kind) {
+    case ExchangeInputKind::kDeep10K:
+      return ExchangeInputSpec{"Deep10K", makeDeepType(), 10, 10'000};
+    case ExchangeInputKind::kDeep50:
+      return ExchangeInputSpec{"Deep50", makeDeepType(), 2'000, 50};
+    case ExchangeInputKind::kStruct1K:
+      return ExchangeInputSpec{"Struct1K", makeStructType(), 100, 1'000};
+  }
+  VELOX_UNREACHABLE();
+}
+
+// Spec for 10 vectors x 10'000 rows of `numCols` columns of type `colType`.
+ExchangeInputSpec makeInputSpec(SimpleColType colType, int32_t numCols) {
+  auto datasetName =
+      fmt::format("Simple10K_{}_col{}", simpleColTypeName(colType), numCols);
+  auto rowType = makeSimpleType(simpleColTypeToType(colType), numCols);
+  return {std::move(datasetName), std::move(rowType), 10, 10'000};
+}
+
+// Renders one statistic of a run. Applies `formatter` to `*stats` and returns
+// its result; when `stats` is null it returns "N/A" without calling
+// `formatter`.
+std::string formatStat(const ExchangeRunStats* stats, auto formatter) {
+  return stats == nullptr ? std::string("N/A") : formatter(*stats);
+}
+
+// Prints every recorded result, grouped by dataset in first-seen order.
+// Output per dataset:
+//   --------------------<dataset>--------------------
+//   Wall Time | normal: <time> | optimized: <time>
+//   Normal
+//    - PartitionedOutput: <stats>
+//    - Exchange: <stats>
+//   Optimized
+//    - PartitionedOutput: <stats>
+//    - Exchange: <stats>
+// A mode without a recorded run prints "N/A" in place of its values.
+void printAllExchangeStats() {
+  struct PairedStats {
+    const ExchangeRunStats* normal = nullptr;
+    const ExchangeRunStats* optimized = nullptr;
+  };
+
+  // `datasetOrder` keeps the first-seen order that the hash map does not. The
+  // stored pointers refer to elements of `benchmarkResults`, which is not
+  // modified in this function.
+  std::vector<std::string> datasetOrder;
+  std::unordered_map<std::string, PairedStats> groupedStats;
+  for (const auto& result : benchmarkResults) {
+    auto [it, inserted] = groupedStats.try_emplace(result.datasetName);
+    if (inserted) {
+      datasetOrder.push_back(result.datasetName);
+    }
+    // A later result for the same dataset and mode replaces the earlier one.
+    if (result.mode == ExchangeMode::kNormal) {
+      it->second.normal = &result.stats;
+    } else {
+      it->second.optimized = &result.stats;
+    }
+  }
+
+  // Formatters shared by both modes.
+  const auto wallTime = [](const ExchangeRunStats& stats) {
+    return succinctMicros(stats.wallUs);
+  };
+  const auto partitionedOutput = [](const ExchangeRunStats& stats) {
+    return stats.partitionedOutputStats.toString();
+  };
+  const auto exchange = [](const ExchangeRunStats& stats) {
+    return stats.exchangeStats.toString();
+  };
+
+  // Prints the operator stats of one mode under the heading `title`.
+  const auto printMode = [&](const char* title, const ExchangeRunStats* stats) {
+    std::cout << title << std::endl
+              << " - PartitionedOutput: "
+              << formatStat(stats, partitionedOutput) << std::endl
+              << " - Exchange: " << formatStat(stats, exchange) << std::endl;
+  };
+
+  for (const auto& datasetName : datasetOrder) {
+    const auto statsIt = groupedStats.find(datasetName);
+    VELOX_CHECK(statsIt != groupedStats.end());
+    const auto& paired = statsIt->second;
+
+    std::cout << "--------------------" << datasetName << "--------------------"
+              << std::endl;
+
+    // succinctMicros() appends its own unit, so the label must not promise
+    // milliseconds.
+    std::cout << "Wall Time | normal: " << formatStat(paired.normal, wallTime)
+              << " | optimized: " << formatStat(paired.optimized, wallTime)
+              << std::endl;
+
+    printMode("Normal", paired.normal);
+    printMode("Optimized", paired.optimized);
+  }
+}
+
+template <typename Fn>
+ExchangeRunStats runBenchmarkIterations(unsigned int iters, Fn&& runOnce) {
+  ExchangeRunStats stats; // Returned unchanged when `iters` is 0.
+  while (iters--) {
+    stats = runOnce(); // Only the stats of the last iteration are kept.
+  }
+  return stats;
 }
 
 class ExchangeBenchmark : public VectorTestBase {
  public:
+  /// Creates a single flat column of `type` with `numRows` rows. Rows with
+  /// `row % 100 < nullPct` are null: 0 (or less) means no nulls, 100 (or more)
+  /// all nulls. The null fraction matches `nullPct` exactly only when `numRows`
+  /// is a multiple of 100. Non-null values are the row number cast to the
+  /// native type (`row % 2 == 0` for BOOLEAN).
+  /// Throws VELOX_NYI for any other type kind (VARCHAR, ARRAY, MAP, ROW, ...).
+  /// NOTE(review): makeDeepType() and makeStructType() contain such columns, so
+  /// makeRows() on those schemas would throw; confirm this is intended.
+  VectorPtr makeColumn(const TypePtr& type, int32_t numRows, int32_t nullPct) {
+    std::function<bool(vector_size_t)> isNull;
+    if (nullPct > 0) {
+      isNull = [nullPct](vector_size_t row) { return (row % 100) < nullPct; };
+    }
+
+    switch (type->kind()) {
+      case TypeKind::BOOLEAN:
+        return makeFlatVector<bool>(
+            numRows, [](auto row) { return row % 2 == 0; }, isNull);
+      case TypeKind::TINYINT:
+        return makeFlatVector<int8_t>(
+            numRows, [](auto row) { return static_cast<int8_t>(row); }, isNull);
+      case TypeKind::SMALLINT:
+        return makeFlatVector<int16_t>(
+            numRows,
+            [](auto row) { return static_cast<int16_t>(row); },
+            isNull);
+      case TypeKind::INTEGER:
+        return makeFlatVector<int32_t>(
+            numRows, [](auto row) { return row; }, isNull);
+      case TypeKind::BIGINT:
+        // Handles plain BIGINT and short-decimal columns (DECIMAL(p,s), p≤18).
+        return makeFlatVector<int64_t>(
+            numRows,
+            [](auto row) { return static_cast<int64_t>(row); },
+            isNull,
+            type);
+      case TypeKind::REAL:
+        return makeFlatVector<float>(
+            numRows, [](auto row) { return static_cast<float>(row); }, isNull);
+      case TypeKind::DOUBLE:
+        return makeFlatVector<double>(
+            numRows, [](auto row) { return static_cast<double>(row); }, isNull);
+      case TypeKind::HUGEINT:
+        // Handles long-decimal columns (DECIMAL(p,s), p>18).
+        return makeFlatVector<int128_t>(
+            numRows,
+            [](auto row) { return static_cast<int128_t>(row); },
+            isNull,
+            type);
+      default:
+        VELOX_NYI("makeColumn does not support type {} yet", type->toString());
+    }
+  }
+
+  /// Generates input batches for the exchange benchmark.
+  ///
+  /// `dictPct` is the percentage of vectors for each column that should be
+  /// wrapped in dictionary encoding across the full set of generated batches.
+  /// For example, with `numVectors = 10` and `dictPct = 30`, each top-level
+  /// column will have 3 dictionary-encoded vectors and 7 simple vectors.
+  /// Nested children of complex columns use the same rule recursively.
+  ///
+  /// `nullPct` controls what fraction of values in each column are null:
+  /// 0 = no nulls, 50 = half the rows null, 100 = all rows null.
   std::vector<RowVectorPtr> makeRows(
-      RowTypePtr type,
+      const RowTypePtr& type,
       int32_t numVectors,
       int32_t rowsPerVector,
-      int32_t dictPct = 0) {
+      int32_t dictPct = 0,
+      int32_t nullPct = 0) {
     std::vector<RowVectorPtr> vectors;
-    BufferPtr indices;
+    vectors.reserve(numVectors);
     for (int32_t i = 0; i < numVectors; ++i) {
-      auto vector = std::dynamic_pointer_cast<RowVector>(
-          BatchMaker::createBatch(type, rowsPerVector, *pool_));
-
-      auto width = vector->childrenSize();
-      for (auto child = 0; child < width; ++child) {
-        if (100 * child / width > dictPct) {
-          if (!indices) {
-            indices = makeIndices(vector->size(), [&](auto i) { return i; });
-          }
-          vector->childAt(child) = BaseVector::wrapInDictionary(
-              nullptr, indices, vector->size(), vector->childAt(child));
+      std::vector<VectorPtr> children;
+      children.reserve(type->size());
+      for (int32_t col = 0; col < type->size(); ++col) {
+        children.push_back(
+            makeColumn(type->childAt(col), rowsPerVector, nullPct));
+      }
+      auto vector = makeRowVector(type->names(), children);
+      if (shouldWrapVector(i, numVectors, dictPct)) {
+        for (auto child = 0; child < vector->childrenSize(); ++child) {
+          wrapDictionaryRecursive(vector->childAt(child));
         }
       }
-      vectors.push_back(vector);
+      vectors.push_back(std::move(vector));
     }
     return vectors;
   }
 
-  void run(
-      std::vector<RowVectorPtr>& vectors,
+  ExchangeRunStats run(
+      const std::vector<RowVectorPtr>& vectors,
       int32_t width,
       int32_t taskWidth,
-      int64_t& wallUs,
-      PlanNodeStats& partitionedOutputStats,
-      PlanNodeStats& exchangeStats) {
+      ExchangeMode mode) {
+    VELOX_CHECK(!vectors.empty());
+
     core::PlanNodePtr plan;
     core::PlanNodeId exchangeId;
     core::PlanNodeId leafPartitionedOutputId;
@@ -136,9 +464,7 @@ class ExchangeBenchmark : public VectorTestBase {
 
     const auto startUs = getCurrentTimeMicro();
     BENCHMARK_SUSPEND {
-      assert(!vectors.empty());
-      configSettings_[core::QueryConfig::kMaxPartitionedOutputBufferSize] =
-          fmt::format("{}", FLAGS_exchange_buffer_mb << 20);
+      configureQuerySettings(mode);
       const auto iteration = ++iteration_;
 
       // leafPlan: PartitionedOutput/kPartitioned(1) <-- Values(0)
@@ -159,7 +485,6 @@ class ExchangeBenchmark : public VectorTestBase {
 
       // finalAggPlan: PartitionedOutput/kPartitioned(2) <-- Agg/kSingle(1) <--
       // Exchange(0)
-      std::vector<std::string> finalAggTaskIds;
       core::PlanNodePtr finalAggPlan =
           exec::test::PlanBuilder()
               .exchange(leafPlan->outputType(), "Presto")
@@ -194,139 +519,44 @@ class ExchangeBenchmark : public VectorTestBase {
         .splits(finalAggSplits)
         .assertResults(expected);
 
+    ExchangeRunStats stats;
     BENCHMARK_SUSPEND {
-      wallUs = getCurrentTimeMicro() - startUs;
-      std::vector<int64_t> taskWallMs;
+      stats.wallUs = getCurrentTimeMicro() - startUs;
 
       for (const auto& task : leafTasks) {
         const auto& taskStats = task->taskStats();
-        taskWallMs.push_back(
-            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
         const auto& planStats = toPlanStats(taskStats);
         auto& taskPartitionedOutputStats =
             planStats.at(leafPartitionedOutputId);
-        partitionedOutputStats += taskPartitionedOutputStats;
+        stats.partitionedOutputStats += taskPartitionedOutputStats;
       }
 
       for (const auto& task : finalAggTasks) {
         const auto& taskStats = task->taskStats();
-        taskWallMs.push_back(
-            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
         const auto& planStats = toPlanStats(taskStats);
 
         auto& taskPartitionedOutputStats =
             planStats.at(finalAggPartitionedOutputId);
-        partitionedOutputStats += taskPartitionedOutputStats;
+        stats.partitionedOutputStats += taskPartitionedOutputStats;
 
         auto& taskExchangeStats = planStats.at(exchangeId);
-        exchangeStats += taskExchangeStats;
-      }
-    };
-  }
-
-  void runLocal(
-      std::vector<RowVectorPtr>& vectors,
-      int32_t taskWidth,
-      int32_t numTasks,
-      int64_t& localPartitionWallUs,
-      PlanNodeStats& partitionedOutputStats,
-      LocalPartitionWaitStats& localPartitionWaitStats) {
-    assert(!vectors.empty());
-
-    core::PlanNodePtr plan;
-    core::PlanNodeId localPartitionId1;
-    core::PlanNodeId localPartitionId2;
-    std::vector<std::shared_ptr<Task>> tasks;
-    std::vector<std::thread> threads;
-
-    RowVectorPtr expected;
-
-    BENCHMARK_SUSPEND {
-      std::vector<std::string> aggregates = {"count(1)"};
-      auto& rowType = vectors[0]->type()->as<TypeKind::ROW>();
-      for (auto i = 1; i < rowType.size(); ++i) {
-        aggregates.push_back(fmt::format("checksum({})", rowType.nameOf(i)));
+        stats.exchangeStats += taskExchangeStats;
       }
-
-      // plan: Agg/kSingle(4) <-- LocalPartition/Gather(3) <-- Agg/kGather(2)
-      // <-- LocalPartition/kRepartition(1) <-- Values(0)
-      plan = exec::test::PlanBuilder()
-                 .values(vectors, true)
-                 .localPartition({"c0"})
-                 .capturePlanNodeId(localPartitionId1)
-                 .singleAggregation({}, aggregates)
-                 .localPartition(std::vector<std::string>{})
-                 .capturePlanNodeId(localPartitionId2)
-                 .singleAggregation({}, {"sum(a0)"})
-                 .planNode();
-
-      threads.reserve(numTasks);
-      expected = makeRowVector({makeFlatVector<int64_t>(1, [&](auto /*row*/) {
-        return vectors.size() * vectors[0]->size() * taskWidth;
-      })});
     };
 
-    auto startMicros = getCurrentTimeMicro();
-    std::mutex mutex;
-    for (int32_t i = 0; i < numTasks; ++i) {
-      threads.push_back(std::thread([&]() {
-        for (auto repeat = 0; repeat < FLAGS_num_local_repeat; ++repeat) {
-          auto task =
-              exec::test::AssertQueryBuilder(plan)
-                  .config(
-                      core::QueryConfig::kMaxLocalExchangeBufferSize,
-                      fmt::format("{}", FLAGS_local_exchange_buffer_mb << 20))
-                  .maxDrivers(taskWidth)
-                  .assertResults(expected);
-          {
-            std::lock_guard<std::mutex> l(mutex);
-            tasks.push_back(task);
-          }
-        }
-      }));
-    }
-    for (auto& thread : threads) {
-      thread.join();
-    }
-
-    BENCHMARK_SUSPEND {
-      localPartitionWallUs = getCurrentTimeMicro() - startMicros;
-
-      std::vector<core::PlanNodeId> localPartitionNodeIds{
-          localPartitionId1, localPartitionId2};
-
-      localPartitionWaitStats.totalProducerWaitMs = 0;
-      localPartitionWaitStats.totalConsumerWaitMs = 0;
-      for (const auto& task : tasks) {
-        auto taskStats = task->taskStats();
-        localPartitionWaitStats.wallMs.push_back(
-            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
-        auto planStats = toPlanStats(taskStats);
-
-        for (const auto& nodeId : localPartitionNodeIds) {
-          auto& taskLocalPartition1Stats = planStats.at(nodeId);
-          partitionedOutputStats += taskLocalPartition1Stats;
-
-          auto& taskLocalPartition1RuntimeStats =
-              taskLocalPartition1Stats.customStats;
-          localPartitionWaitStats.producerWaitMs.push_back(
-              taskLocalPartition1RuntimeStats
-                  ["blockedWaitForProducerWallNanos"]);
-          localPartitionWaitStats.consumerWaitMs.push_back(
-              taskLocalPartition1RuntimeStats
-                  ["blockedWaitForConsumerWallNanos"]);
-          localPartitionWaitStats.totalProducerWaitMs +=
-              localPartitionWaitStats.producerWaitMs.back().sum;
-          localPartitionWaitStats.totalConsumerWaitMs +=
-              localPartitionWaitStats.consumerWaitMs.back().sum;
-        }
-      }
-    };
+    return stats;
   }
 
  private:
   static constexpr int64_t kMaxMemory = 6UL << 30; // 6GB
 
+  void configureQuerySettings(ExchangeMode mode) {
+    configSettings_[core::QueryConfig::kMaxPartitionedOutputBufferSize] =
+        fmt::format("{}", FLAGS_exchange_buffer_mb << 20); // MB -> bytes.
+    configSettings_[core::QueryConfig::kOptimizedPartitionedOutputEnabled] =
+        mode == ExchangeMode::kOptimized ? "true" : "false";
+  }
+
   static std::string
   makeTaskId(int32_t iteration, const std::string& prefix, int num) {
     return fmt::format("local://{}-{}-{}", iteration, prefix, num);
@@ -373,223 +603,131 @@ int32_t ExchangeBenchmark::iteration_;
 
 std::unique_ptr<ExchangeBenchmark> bm;
 
-void runBenchmarks() {
-  std::vector<std::string> flatNames = {"c0"};
-  std::vector<TypePtr> flatTypes = {BIGINT()};
-  std::vector<TypePtr> typeSelection = {
-      BOOLEAN(),
-      TINYINT(),
-      DECIMAL(20, 3),
-      INTEGER(),
-      BIGINT(),
-      REAL(),
-      DECIMAL(10, 2),
-      DOUBLE(),
-      VARCHAR()};
-
-  int64_t flatSize = 0;
-  // Add enough columns of different types to make a 10K row batch be
-  // flat_batch_mb in flat size.
-  while (flatSize * 10000 < static_cast<int64_t>(FLAGS_flat_batch_mb) << 20) {
-    flatNames.push_back(fmt::format("c{}", flatNames.size()));
-    assert(!flatNames.empty());
-    flatTypes.push_back(typeSelection[flatTypes.size() % typeSelection.size()]);
-    if (flatTypes.back()->isFixedWidth()) {
-      flatSize += flatTypes.back()->cppSizeInBytes();
-    } else {
-      flatSize += 20;
-    }
-  }
-  auto flatType = ROW(std::move(flatNames), std::move(flatTypes));
-
-  auto structType = ROW(
-      {{"c0", BIGINT()},
-       {"r1",
-        ROW(
-            {{"k2", BIGINT()},
-             {"r2",
-              ROW(
-                  {{"i1", BIGINT()},
-                   {"i2", BIGINT()},
-                   {"r3}, ROW({{s3", VARCHAR()},
-                   {"i5", INTEGER()},
-                   {"d5", DOUBLE()},
-                   {"b5", BOOLEAN()},
-                   {"a5", ARRAY(TINYINT())}})}})}});
-
-  auto deepType = ROW(
-      {{"c0", BIGINT()},
-       {"long_array_val", ARRAY(ARRAY(BIGINT()))},
-       {"array_val", ARRAY(VARCHAR())},
-       {"struct_val", ROW({{"s_int", INTEGER()}, {"s_array", ARRAY(REAL())}})},
-       {"map_val",
-        MAP(VARCHAR(),
-            MAP(BIGINT(),
-                ROW({{"s2_int", INTEGER()}, {"s2_string", VARCHAR()}})))}});
-
-  std::vector<RowVectorPtr> flat10k(
-      bm->makeRows(flatType, 10, 10000, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> deep10k(
-      bm->makeRows(deepType, 10, 10000, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> flat50(
-      bm->makeRows(flatType, 2000, 50, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> deep50(
-      bm->makeRows(deepType, 2000, 50, FLAGS_dict_pct));
-  std::vector<RowVectorPtr> struct1k(
-      bm->makeRows(structType, 100, 1000, FLAGS_dict_pct));
-
-  int64_t flat10KWallUs;
-  PlanNodeStats partitionedOutputStatsFlat10K;
-  PlanNodeStats exchangeStatsFlat10K;
-  folly::addBenchmark(__FILE__, "exchangeFlat10k", [&]() {
-    bm->run(
-        flat10k,
-        FLAGS_width,
-        FLAGS_task_width,
-        flat10KWallUs,
-        partitionedOutputStatsFlat10K,
-        exchangeStatsFlat10K);
-    return 1;
-  });
-
-  int64_t flat50KWallUs;
-  PlanNodeStats partitionedOutputStatsFlat50;
-  PlanNodeStats exchangeStatsFlat50;
-  folly::addBenchmark(__FILE__, "exchangeFlat50", [&]() {
-    bm->run(
-        flat50,
-        FLAGS_width,
-        FLAGS_task_width,
-        flat50KWallUs,
-        partitionedOutputStatsFlat50,
-        exchangeStatsFlat50);
-    return 1;
-  });
-
-  int64_t deep10KWallUs;
-  PlanNodeStats partitionedOutputStatsDeep10K;
-  PlanNodeStats exchangeStatsDeep10K;
-  folly::addBenchmark(__FILE__, "exchangeDeep10k", [&]() {
-    bm->run(
-        deep10k,
-        FLAGS_width,
-        FLAGS_task_width,
-        deep10KWallUs,
-        partitionedOutputStatsDeep10K,
-        exchangeStatsDeep10K);
-    return 1;
-  });
-
-  int64_t deep50KWallUs;
-  PlanNodeStats partitionedOutputStatsDeep50;
-  PlanNodeStats exchangeStatsDeep50;
-  folly::addBenchmark(__FILE__, "exchangeDeep50", [&]() {
-    bm->run(
-        deep50,
-        FLAGS_width,
-        FLAGS_task_width,
-        deep50KWallUs,
-        partitionedOutputStatsDeep50,
-        exchangeStatsDeep50);
-    return 1;
-  });
-
-  int64_t stuct1KWallUs;
-  PlanNodeStats partitionedOutputStatsStruct1K;
-  PlanNodeStats exchangeStatsStruct1K;
-  folly::addBenchmark(__FILE__, "exchangeStruct1K", [&]() {
-    bm->run(
-        struct1k,
-        FLAGS_width,
-        FLAGS_task_width,
-        stuct1KWallUs,
-        partitionedOutputStatsStruct1K,
-        exchangeStatsStruct1K);
-    return 1;
-  });
-
-  int64_t localPartitionWallUs;
-  PlanNodeStats localPartitionStatsFlat10K;
-  LocalPartitionWaitStats localPartitionWaitStats;
-  folly::addBenchmark(__FILE__, "localFlat10k", [&]() {
-    bm->runLocal(
-        flat10k,
-        FLAGS_width,
-        FLAGS_num_local_tasks,
-        localPartitionWallUs,
-        localPartitionStatsFlat10K,
-        localPartitionWaitStats);
-    return 1;
+void benchmarkExchange(
+    unsigned int iters,
+    const ExchangeInputSpec& input,
+    ExchangeMode mode,
+    int32_t dictPct,
+    int32_t nullPct) {
+  auto vectors = bm->makeRows(
+      input.type, input.numVectors, input.rowsPerVector, dictPct, nullPct);
+  auto stats = runBenchmarkIterations(iters, [&]() {
+    return bm->run(vectors, FLAGS_width, FLAGS_task_width, mode);
   });
+  benchmarkResults.push_back(
+      {fmt::format("{}_dict{}_null{}", input.name, dictPct, nullPct),
+       mode,
+       std::move(stats)});
+}
 
-  folly::runBenchmarks();
+#define EXCHANGE_BENCHMARK_NAMED_PARAM(name, param_name, ...) \
+  BENCHMARK_IMPL(                                             \
+      FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)),    \
+      FOLLY_PP_STRINGIZE(param_name),                         \
+      iters,                                                  \
+      unsigned,                                               \
+      iters) {                                                \
+    name(iters, ##__VA_ARGS__);                               \
+  }
 
-  std::cout
-      << "----------------------------------Flat10K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(flat10KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsFlat10K.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsFlat10K.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Flat50K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(flat50KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsFlat50.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsFlat10K.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Deep10K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(deep10KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsDeep10K.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsDeep10K.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Deep50K----------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(deep50KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsDeep50.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsDeep50.toString() << std::endl;
-
-  std::cout
-      << "----------------------------------Struct1K---------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << succinctMicros(stuct1KWallUs) << std::endl;
-  std::cout << "PartitionOutput: " << partitionedOutputStatsStruct1K.toString()
-            << std::endl;
-  std::cout << "Exchange: " << exchangeStatsStruct1K.toString() << std::endl;
-
-  std::cout
-      << "--------------------------------LocalFlat10K-------------------------------"
-      << std::endl;
-  std::cout << "Wall Time (ms): " << "\n Total: "
-            << succinctMicros(localPartitionWallUs)
-            << "\n Max: " << localPartitionWaitStats.wallMs.back()
-            << "\n Median: "
-            << localPartitionWaitStats
-                   .wallMs[localPartitionWaitStats.wallMs.size() / 2]
-            << "\n Min: " << localPartitionWaitStats.wallMs.front()
-            << std::endl;
-  std::cout << "LocalPartition: " << localPartitionStatsFlat10K.toString()
-            << std::endl;
-  sortByAndPrintMax(
-      "Producer Wait Time (ms)",
-      localPartitionWaitStats.totalProducerWaitMs,
-      localPartitionWaitStats.producerWaitMs);
-  sortByAndPrintMax(
-      "Consumer Wait Time (ms)",
-      localPartitionWaitStats.totalConsumerWaitMs,
-      localPartitionWaitStats.consumerWaitMs);
-  std::sort(
-      localPartitionWaitStats.wallMs.begin(),
-      localPartitionWaitStats.wallMs.end());
-  assert(!localPartitionWaitStats.wallMs.empty());
-}
+// ── Benchmarks: input spec × nullPct × mode ───────────────────────────────
+
+#define EXCHANGE_BENCHMARK_INPUT(                                     \
+    _case_name, _input_expr, _mode_name, _dict_pct, _null_pct, _mode) \
+  EXCHANGE_BENCHMARK_NAMED_PARAM(                                     \
+      benchmarkExchange,                                              \
+      _case_name##_dict##_dict_pct##_null##_null_pct##_##_mode_name,  \
+      _input_expr,                                                    \
+      ExchangeMode::_mode,                                            \
+      _dict_pct,                                                      \
+      _null_pct)
+
+#define EXCHANGE_BENCHMARK_MODES(                                      \
+    _case_name, _input_expr, _dict_pct, _null_pct)                     \
+  EXCHANGE_BENCHMARK_INPUT(                                            \
+      _case_name, _input_expr, normal, _dict_pct, _null_pct, kNormal); \
+  EXCHANGE_BENCHMARK_INPUT(                                            \
+      _case_name, _input_expr, optimized, _dict_pct, _null_pct, kOptimized)
+
+#define EXCHANGE_BENCHMARK_CASE(_case_name, _input_expr)    \
+  EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 0);  \
+  EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 50); \
+  EXCHANGE_BENCHMARK_MODES(_case_name, _input_expr, 0, 100)
+
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Boolean_col1,
+    makeInputSpec(SimpleColType::kBoolean, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Boolean_col4,
+    makeInputSpec(SimpleColType::kBoolean, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Boolean_col16,
+    makeInputSpec(SimpleColType::kBoolean, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Tinyint_col1,
+    makeInputSpec(SimpleColType::kTinyint, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Tinyint_col4,
+    makeInputSpec(SimpleColType::kTinyint, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Tinyint_col16,
+    makeInputSpec(SimpleColType::kTinyint, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Integer_col1,
+    makeInputSpec(SimpleColType::kInteger, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Integer_col4,
+    makeInputSpec(SimpleColType::kInteger, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Integer_col16,
+    makeInputSpec(SimpleColType::kInteger, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Bigint_col1,
+    makeInputSpec(SimpleColType::kBigint, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Bigint_col4,
+    makeInputSpec(SimpleColType::kBigint, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Bigint_col16,
+    makeInputSpec(SimpleColType::kBigint, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Hugeint_col1,
+    makeInputSpec(SimpleColType::kHugeint, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Hugeint_col4,
+    makeInputSpec(SimpleColType::kHugeint, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Hugeint_col16,
+    makeInputSpec(SimpleColType::kHugeint, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_LongDecimal_col1,
+    makeInputSpec(SimpleColType::kLongDecimal, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_LongDecimal_col4,
+    makeInputSpec(SimpleColType::kLongDecimal, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_LongDecimal_col16,
+    makeInputSpec(SimpleColType::kLongDecimal, 16));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Double_col1,
+    makeInputSpec(SimpleColType::kDouble, 1));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Double_col4,
+    makeInputSpec(SimpleColType::kDouble, 4));
+EXCHANGE_BENCHMARK_CASE(
+    Simple10K_Double_col16,
+    makeInputSpec(SimpleColType::kDouble, 16));
+
+// The complex type benchmarks are temporarily disabled.
+// EXCHANGE_BENCHMARK_CASE(Deep10K, makeInputSpec(ExchangeInputKind::kDeep10K));
+// EXCHANGE_BENCHMARK_CASE(Deep50, makeInputSpec(ExchangeInputKind::kDeep50));
+// EXCHANGE_BENCHMARK_CASE(Struct1K,
+// makeInputSpec(ExchangeInputKind::kStruct1K));
+
+#undef EXCHANGE_BENCHMARK_CASE
+#undef EXCHANGE_BENCHMARK_MODES
+#undef EXCHANGE_BENCHMARK_INPUT
+#undef EXCHANGE_BENCHMARK_NAMED_PARAM
 
 } // namespace
 
@@ -605,7 +743,8 @@ int main(int argc, char** argv) {
   exec::ExchangeSource::registerFactory(exec::test::createLocalExchangeSource);
 
   bm = std::make_unique<ExchangeBenchmark>();
-  runBenchmarks();
+  folly::runBenchmarks();
+  printAllExchangeStats();
   bm.reset();
 
   return 0;
diff --git a/velox/exec/benchmarks/LocalExchangeBenchmark.cpp b/velox/exec/benchmarks/LocalExchangeBenchmark.cpp
new file mode 100644
index 00000000000..50b7637fd92
--- /dev/null
+++ b/velox/exec/benchmarks/LocalExchangeBenchmark.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include <algorithm>
+#include <mutex>
+#include <thread>
+
+#include "velox/core/QueryConfig.h"
+#include "velox/dwio/common/tests/utils/BatchMaker.h"
+#include "velox/exec/PlanNodeStats.h"
+#include "velox/exec/tests/utils/AssertQueryBuilder.h"
+#include "velox/exec/tests/utils/PlanBuilder.h"
+#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h"
+#include "velox/functions/prestosql/registration/RegistrationFunctions.h"
+#include "velox/parse/TypeResolver.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+DEFINE_int32(width, 16, "Number of drivers in each local exchange task");
+DEFINE_int32(num_local_tasks, 8, "Number of concurrent local shuffles");
+DEFINE_int32(num_local_repeat, 8, "Number of repeats of local exchange query");
+DEFINE_int32(flat_batch_mb, 1, "MB in a 10k row flat batch.");
+DEFINE_int64(
+    local_exchange_buffer_mb,
+    32,
+    "task-wide buffer in local exchange");
+DEFINE_int32(dict_pct, 0, "Percentage of columns wrapped in dictionary");
+// Add the following definitions to allow CLion runs
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
+
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+struct LocalPartitionWaitStats {
+  int64_t totalProducerWaitMs = 0;
+  int64_t totalConsumerWaitMs = 0;
+  std::vector<RuntimeMetric> consumerWaitMs;
+  std::vector<RuntimeMetric> producerWaitMs;
+  std::vector<int64_t> wallMs;
+};
+
+// Orders 'metrics' in place by descending 'max', i.e. largest first.
+void sortByMax(std::vector<RuntimeMetric>& metrics) {
+  const auto byMaxDescending = [](const RuntimeMetric& lhs,
+                                  const RuntimeMetric& rhs) {
+    return lhs.max > rhs.max;
+  };
+  std::sort(metrics.begin(), metrics.end(), byMaxDescending);
+}
+
+void sortByAndPrintMax(
+    const char* title,
+    int64_t total,
+    std::vector<RuntimeMetric>& metrics) {
+  sortByMax(metrics);
+  VELOX_CHECK(!metrics.empty());
+  std::cout << title << "\n Total " << succinctNanos(total)
+            << "\n Max: " << metrics.front().toString()
+            << "\n Median: " << metrics[metrics.size() / 2].toString()
+            << "\n Min: " << metrics.back().toString() << std::endl;
+}
+
+class LocalExchangeBenchmark : public VectorTestBase {
+ public:
+  // Makes batches; the leading 'dictPct' percent of columns get dictionaries.
+  std::vector<RowVectorPtr> makeRows(
+      RowTypePtr type,
+      int32_t numVectors,
+      int32_t rowsPerVector,
+      int32_t dictPct = 0) {
+    std::vector<RowVectorPtr> vectors;
+    BufferPtr indices;
+    for (int32_t i = 0; i < numVectors; ++i) {
+      auto vector = std::dynamic_pointer_cast<RowVector>(
+          BatchMaker::createBatch(type, rowsPerVector, *pool_));
+      auto width = vector->childrenSize();
+      for (auto child = 0; child < width; ++child) {
+        if (100 * child / width < dictPct) {
+          if (!indices) {
+            indices = makeIndices(vector->size(), [&](auto i) { return i; });
+          }
+          vector->childAt(child) = BaseVector::wrapInDictionary(
+              nullptr, indices, vector->size(), vector->childAt(child));
+        }
+      }
+      vectors.push_back(vector);
+    }
+    return vectors;
+  }
+
+  void runLocal(
+      std::vector<RowVectorPtr>& vectors,
+      int32_t taskWidth,
+      int32_t numTasks,
+      int64_t& localPartitionWallUs,
+      PlanNodeStats& partitionedOutputStats,
+      LocalPartitionWaitStats& localPartitionWaitStats) {
+    VELOX_CHECK(!vectors.empty());
+
+    core::PlanNodePtr plan;
+    core::PlanNodeId localPartitionId1;
+    core::PlanNodeId localPartitionId2;
+    std::vector<std::shared_ptr<Task>> tasks;
+    std::vector<std::thread> threads;
+
+    RowVectorPtr expected;
+
+    BENCHMARK_SUSPEND {
+      std::vector<std::string> aggregates = {"count(1)"};
+      auto& rowType = vectors[0]->type()->as<TypeKind::ROW>();
+      for (auto i = 1; i < rowType.size(); ++i) {
+        aggregates.push_back(fmt::format("checksum({})", rowType.nameOf(i)));
+      }
+
+      // plan: Agg/kSingle(4) <-- LocalPartition/Gather(3) <-- Agg/kGather(2)
+      // <-- LocalPartition/kRepartition(1) <-- Values(0)
+      plan = exec::test::PlanBuilder()
+                 .values(vectors, true)
+                 .localPartition({"c0"})
+                 .capturePlanNodeId(localPartitionId1)
+                 .singleAggregation({}, aggregates)
+                 .localPartition(std::vector<std::string>{})
+                 .capturePlanNodeId(localPartitionId2)
+                 .singleAggregation({}, {"sum(a0)"})
+                 .planNode();
+
+      threads.reserve(numTasks);
+      expected = makeRowVector({makeFlatVector<int64_t>(1, [&](auto /*row*/) {
+        return vectors.size() * vectors[0]->size() * taskWidth;
+      })});
+    };
+
+    const auto startMicros = getCurrentTimeMicro();
+    std::mutex mutex;
+    for (int32_t i = 0; i < numTasks; ++i) {
+      threads.push_back(std::thread([&]() {
+        for (auto repeat = 0; repeat < FLAGS_num_local_repeat; ++repeat) {
+          auto task =
+              exec::test::AssertQueryBuilder(plan)
+                  .config(
+                      core::QueryConfig::kMaxLocalExchangeBufferSize,
+                      fmt::format("{}", FLAGS_local_exchange_buffer_mb << 20))
+                  .maxDrivers(taskWidth)
+                  .assertResults(expected);
+          {
+            std::lock_guard<std::mutex> l(mutex);
+            tasks.push_back(task);
+          }
+        }
+      }));
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+
+    BENCHMARK_SUSPEND {
+      localPartitionWallUs = getCurrentTimeMicro() - startMicros;
+
+      std::vector<core::PlanNodeId> localPartitionNodeIds{
+          localPartitionId1, localPartitionId2};
+
+      localPartitionWaitStats.totalProducerWaitMs = 0;
+      localPartitionWaitStats.totalConsumerWaitMs = 0;
+      for (const auto& task : tasks) {
+        const auto taskStats = task->taskStats();
+        localPartitionWaitStats.wallMs.push_back(
+            taskStats.executionEndTimeMs - taskStats.executionStartTimeMs);
+        const auto planStats = toPlanStats(taskStats);
+
+        for (const auto& nodeId : localPartitionNodeIds) {
+          const auto planStatsIt = planStats.find(nodeId);
+          if (planStatsIt == planStats.end()) {
+            continue;
+          }
+          const auto& taskLocalPartitionStats = planStatsIt->second;
+          partitionedOutputStats += taskLocalPartitionStats;
+
+          const auto& runtimeStats = taskLocalPartitionStats.customStats;
+          const auto producerWaitIt =
+              runtimeStats.find("blockedWaitForProducerWallNanos");
+          const auto consumerWaitIt =
+              runtimeStats.find("blockedWaitForConsumerWallNanos");
+          const RuntimeMetric producerWait =
+              producerWaitIt == runtimeStats.end() ? RuntimeMetric{}
+                                                   : producerWaitIt->second;
+          const RuntimeMetric consumerWait =
+              consumerWaitIt == runtimeStats.end() ? RuntimeMetric{}
+                                                   : consumerWaitIt->second;
+          localPartitionWaitStats.producerWaitMs.push_back(producerWait);
+          localPartitionWaitStats.consumerWaitMs.push_back(consumerWait);
+          localPartitionWaitStats.totalProducerWaitMs +=
+              localPartitionWaitStats.producerWaitMs.back().sum;
+          localPartitionWaitStats.totalConsumerWaitMs +=
+              localPartitionWaitStats.consumerWaitMs.back().sum;
+        }
+      }
+    };
+  }
+};
+
+std::unique_ptr<LocalExchangeBenchmark> bm;
+
+void runBenchmarks() {
+  std::vector<std::string> flatNames = {"c0"};
+  std::vector<TypePtr> flatTypes = {BIGINT()};
+  std::vector<TypePtr> typeSelection = {
+      BOOLEAN(),
+      TINYINT(),
+      DECIMAL(20, 3),
+      INTEGER(),
+      BIGINT(),
+      REAL(),
+      DECIMAL(10, 2),
+      DOUBLE(),
+      VARCHAR()};
+
+  int64_t flatSize = 0;
+  // Add enough columns of different types to make a 10K row batch be
+  // flat_batch_mb in flat size.
+  while (flatSize * 10000 < static_cast<int64_t>(FLAGS_flat_batch_mb) << 20) {
+    flatNames.push_back(fmt::format("c{}", flatNames.size()));
+    flatTypes.push_back(typeSelection[flatTypes.size() % typeSelection.size()]);
+    if (flatTypes.back()->isFixedWidth()) {
+      flatSize += flatTypes.back()->cppSizeInBytes();
+    } else {
+      flatSize += 20;
+    }
+  }
+  auto flatType = ROW(std::move(flatNames), std::move(flatTypes));
+  std::vector<RowVectorPtr> flat10k(
+      bm->makeRows(flatType, 10, 10000, FLAGS_dict_pct));
+
+  int64_t localPartitionWallUs;
+  PlanNodeStats localPartitionStatsFlat10K;
+  LocalPartitionWaitStats localPartitionWaitStats;
+  folly::addBenchmark(__FILE__, "localFlat10k", [&]() {
+    bm->runLocal(
+        flat10k,
+        FLAGS_width,
+        FLAGS_num_local_tasks,
+        localPartitionWallUs,
+        localPartitionStatsFlat10K,
+        localPartitionWaitStats);
+    return 1;
+  });
+
+  folly::runBenchmarks();
+
+  std::sort(
+      localPartitionWaitStats.wallMs.begin(),
+      localPartitionWaitStats.wallMs.end());
+  VELOX_CHECK(!localPartitionWaitStats.wallMs.empty());
+
+  std::cout
+      << "--------------------------------LocalFlat10K-------------------------------"
+      << std::endl;
+  std::cout << "Wall Time (ms): " << "\n Total: "
+            << succinctMicros(localPartitionWallUs)
+            << "\n Max: " << localPartitionWaitStats.wallMs.back()
+            << "\n Median: "
+            << localPartitionWaitStats
+                   .wallMs[localPartitionWaitStats.wallMs.size() / 2]
+            << "\n Min: " << localPartitionWaitStats.wallMs.front()
+            << std::endl;
+  std::cout << "LocalPartition: " << localPartitionStatsFlat10K.toString()
+            << std::endl;
+  sortByAndPrintMax(
+      "Producer Wait Time (ms)",
+      localPartitionWaitStats.totalProducerWaitMs,
+      localPartitionWaitStats.producerWaitMs);
+  sortByAndPrintMax(
+      "Consumer Wait Time (ms)",
+      localPartitionWaitStats.totalConsumerWaitMs,
+      localPartitionWaitStats.consumerWaitMs);
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv};
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  functions::prestosql::registerAllScalarFunctions();
+  aggregate::prestosql::registerAllAggregateFunctions();
+  parse::registerTypeResolver();
+
+  bm = std::make_unique<LocalExchangeBenchmark>();
+  runBenchmarks();
+  bm.reset();
+
+  return 0;
+}
diff --git a/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp b/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp
new file mode 100644
index 00000000000..3d2635fda94
--- /dev/null
+++ b/velox/exec/benchmarks/OptimizedHashPartitionFunctionBenchmark.cpp
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <array>
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/tests/utils/VectorMaker.h"
+
+// Add the following definitions to allow CLion runs.
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+constexpr vector_size_t kSize = 10'000;
+constexpr vector_size_t kDictionarySize = kSize / 5;
+
+enum class FunctionKind {
+  kNormal,
+  kOptimized,
+};
+
+enum class EncodingMode {
+  kFlat,
+  kDictionary,
+  kConstant,
+};
+
+enum class NullMode {
+  kNoNulls,
+  kHalfNulls,
+  kAllNulls,
+};
+
+enum class PartitionMode {
+  kRemote,
+  kLocalExchange,
+  kHashBitRangeFirst8,
+  kHashBitRangeLast8,
+};
+
+template <typename T>
+T makeValue(vector_size_t row) {
+  return static_cast<T>((row * 8191) ^ (row >> 3));
+}
+
+template <>
+bool makeValue<bool>(vector_size_t row) {
+  return (row & 1) == 0;
+}
+
+template <>
+StringView makeValue<StringView>(vector_size_t row) {
+  thread_local std::array<char, 20> buffer;
+  const auto length = 5 + row % 16;
+  for (vector_size_t index = 0; index < length; ++index) {
+    buffer[index] = 'a' + (row + index * 7) % 26;
+  }
+  return StringView(buffer.data(), length);
+}
+
+std::function<bool(vector_size_t)> makeNulls(NullMode nullMode) {
+  switch (nullMode) {
+    case NullMode::kNoNulls:
+      return nullptr;
+    case NullMode::kHalfNulls:
+      return [](vector_size_t row) { return (row & 1) == 0; };
+    case NullMode::kAllNulls:
+      return [](vector_size_t /*row*/) { return true; };
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+// Wraps 'base' in a dictionary of 'size' rows whose indices walk the base in
+// reverse, modulo its size. kHalfNulls nulls the even rows at the dictionary
+// level; kAllNulls nulls every row.
+VectorPtr wrapInDictionary(
+    const VectorPtr& base,
+    vector_size_t size,
+    memory::MemoryPool* pool,
+    NullMode nullMode = NullMode::kNoNulls) {
+  const auto baseSize = base->size();
+  auto indices = AlignedBuffer::allocate<vector_size_t>(size, pool);
+  auto* rawIndices = indices->asMutable<vector_size_t>();
+  for (vector_size_t row = 0; row < size; ++row) {
+    rawIndices[row] = (size - row - 1) % baseSize;
+  }
+
+  BufferPtr nulls;
+  if (nullMode != NullMode::kNoNulls) {
+    const bool allNull = nullMode == NullMode::kAllNulls;
+    nulls = AlignedBuffer::allocate<bool>(size, pool);
+    auto* rawNulls = nulls->asMutable<uint64_t>();
+    bits::fillBits(rawNulls, 0, size, allNull ? bits::kNull : bits::kNotNull);
+    for (vector_size_t row = 0; !allNull && row < size; row += 2) {
+      bits::setNull(rawNulls, row);
+    }
+  }
+
+  return BaseVector::wrapInDictionary(nulls, indices, size, base);
+}
+
+template <typename T>
+VectorPtr makeValuesVector(
+    VectorMaker& vectorMaker,
+    memory::MemoryPool* pool,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    vector_size_t size) {
+  const auto flatSize =
+      encodingMode == EncodingMode::kDictionary ? kDictionarySize : size;
+  auto flat = vectorMaker.flatVector<T>(
+      flatSize,
+      [](vector_size_t row) { return makeValue<T>(row); },
+      makeNulls(nullMode));
+
+  switch (encodingMode) {
+    case EncodingMode::kFlat:
+      return flat;
+    case EncodingMode::kDictionary:
+      return wrapInDictionary(flat, size, pool);
+    case EncodingMode::kConstant:
+      if (nullMode == NullMode::kAllNulls) {
+        return BaseVector::createNullConstant(
+            CppToType<T>::create(), size, pool);
+      }
+      if (nullMode == NullMode::kHalfNulls) {
+        auto constant = BaseVector::wrapInConstant(size, 1, flat);
+        // ConstantVector has one nullness for all logical rows. Use a
+        // dictionary wrapper to express alternating nulls while keeping the
+        // repeated-value payload constant.
+        return wrapInDictionary(constant, size, pool, nullMode);
+      }
+      return BaseVector::wrapInConstant(size, 0, flat);
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+template <FunctionKind Kind>
+std::unique_ptr<HashPartitionFunctionBase> makePartitionFunction(
+    PartitionMode partitionMode,
+    const RowTypePtr& inputType,
+    int numPartitions) {
+  switch (partitionMode) {
+    case PartitionMode::kRemote:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            false, numPartitions, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            false, numPartitions, inputType, std::vector<column_index_t>{0});
+      }
+    case PartitionMode::kLocalExchange:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            true, numPartitions, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            true, numPartitions, inputType, std::vector<column_index_t>{0});
+      }
+    case PartitionMode::kHashBitRangeFirst8:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            HashBitRange{0, 8}, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            HashBitRange{0, 8}, inputType, std::vector<column_index_t>{0});
+      }
+    case PartitionMode::kHashBitRangeLast8:
+      if constexpr (Kind == FunctionKind::kNormal) {
+        return std::make_unique<HashPartitionFunction>(
+            HashBitRange{56, 64}, inputType, std::vector<column_index_t>{0});
+      } else {
+        return std::make_unique<OptimizedHashPartitionFunction>(
+            HashBitRange{56, 64}, inputType, std::vector<column_index_t>{0});
+      }
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+void normalRangeReduction(
+    const uint64_t* hashes,
+    uint32_t* partitions,
+    int size,
+    uint32_t numPartitions) {
+  for (int index = 0; index < size; ++index) {
+    partitions[index] = hashes[index] % numPartitions;
+  }
+}
+
+// Times 'iterations' passes of the range reduction selected by Kind over
+// kSize synthetic hashes. Input generation runs while timing is suspended.
+template <FunctionKind Kind>
+void runRangeReductionBenchmark(uint32_t iterations, uint32_t numPartitions) {
+  folly::BenchmarkSuspender suspender;
+  std::vector<uint64_t> hashes(kSize);
+  std::vector<uint32_t> partitions(kSize);
+  for (vector_size_t row = 0; row < kSize; ++row) {
+    // Widen first so the multiplications use 64-bit unsigned arithmetic.
+    const auto wideRow = static_cast<uint64_t>(row);
+    hashes[row] = ((wideRow * 8191) << 32) ^ (wideRow * 1315423911ULL + 17);
+  }
+  suspender.dismiss();
+  for (uint32_t iteration = 0; iteration < iterations; ++iteration) {
+    if constexpr (Kind == FunctionKind::kNormal) {
+      normalRangeReduction(
+          hashes.data(), partitions.data(), kSize, numPartitions);
+    } else {
+      rangeReduction(hashes.data(), partitions.data(), kSize, numPartitions);
+    }
+    folly::doNotOptimizeAway(partitions.data());
+  }
+}
+
+// Times 'iterations' calls to partition() on the function selected by Kind,
+// using a one-column row vector of kSize values of type T. The input vector
+// and the partition function are built while timing is suspended.
+template <typename T, FunctionKind Kind>
+void runPartitionBenchmark(
+    uint32_t iterations,
+    PartitionMode partitionMode,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    int numPartitions) {
+  folly::BenchmarkSuspender suspender;
+  auto pool = memory::memoryManager()->addLeafPool();
+  VectorMaker vectorMaker(pool.get());
+  auto keys = makeValuesVector<T>(
+      vectorMaker, pool.get(), encodingMode, nullMode, kSize);
+  auto input = vectorMaker.rowVector({keys});
+  auto partitionFunction = makePartitionFunction<Kind>(
+      partitionMode, asRowType(input->type()), numPartitions);
+  std::vector<uint32_t> partitions;
+  suspender.dismiss();
+  for (uint32_t remaining = iterations; remaining > 0; --remaining) {
+    const std::optional<uint32_t> singlePartition =
+        partitionFunction->partition(*input, partitions);
+    if (singlePartition.has_value()) {
+      std::fill(partitions.begin(), partitions.end(), *singlePartition);
+    }
+    folly::doNotOptimizeAway(partitions.data());
+  }
+}
+
+template <typename T> // Baseline entry point used by REGISTER_PARTITION_PAIR.
+void benchmarkNormalHashPartitionFunction(
+    uint32_t iterations,
+    PartitionMode partitionMode,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    int numPartitions) {
+  runPartitionBenchmark<T, FunctionKind::kNormal>(
+      iterations, partitionMode, encodingMode, nullMode, numPartitions);
+}
+
+template <typename T> // Relative entry point used by REGISTER_PARTITION_PAIR.
+void benchmarkOptimizedHashPartitionFunction(
+    uint32_t iterations,
+    PartitionMode partitionMode,
+    EncodingMode encodingMode,
+    NullMode nullMode,
+    int numPartitions) {
+  runPartitionBenchmark<T, FunctionKind::kOptimized>(
+      iterations, partitionMode, encodingMode, nullMode, numPartitions);
+}
+
+// Registers a baseline benchmark that runs HashPartitionFunction and a
+// relative benchmark that runs OptimizedHashPartitionFunction with the same
+// arguments, then draws a separator line. T is the key element type. Each
+// *_NAME argument is only pasted into the benchmark identifiers; the other
+// argument of each pair is the value passed to the benchmark function.
+#define REGISTER_PARTITION_PAIR( \
+    T, TYPE_NAME, \
+    PARTITION_MODE, PARTITION_NAME, \
+    NUM_PARTITIONS, NUM_PARTITIONS_NAME, \
+    ENCODING_MODE, ENCODING_NAME, \
+    NULL_MODE, NULL_NAME) \
+  BENCHMARK( \
+      partition_##TYPE_NAME##_##PARTITION_NAME##_##NUM_PARTITIONS_NAME##_##ENCODING_NAME##_##NULL_NAME, \
+      iterations) { \
+    benchmarkNormalHashPartitionFunction<T>( \
+        iterations, PARTITION_MODE, ENCODING_MODE, NULL_MODE, NUM_PARTITIONS); \
+  } \
+  BENCHMARK_RELATIVE( \
+      optimized_partition_##TYPE_NAME##_##PARTITION_NAME##_##NUM_PARTITIONS_NAME##_##ENCODING_NAME##_##NULL_NAME, \
+      iterations) { \
+    benchmarkOptimizedHashPartitionFunction<T>( \
+        iterations, PARTITION_MODE, ENCODING_MODE, NULL_MODE, NUM_PARTITIONS); \
+  } \
+  BENCHMARK_DRAW_LINE();
+
+#define REGISTER_PARTITION_NULL_MODES( \
+    T,                                 \
+    TYPE_NAME,                         \
+    PARTITION_MODE,                    \
+    PARTITION_NAME,                    \
+    NUM_PARTITIONS,                    \
+    NUM_PARTITIONS_NAME,               \
+    ENCODING_MODE,                     \
+    ENCODING_NAME)                     \
+  REGISTER_PARTITION_PAIR(             \
+      T,                               \
+      TYPE_NAME,                       \
+      PARTITION_MODE,                  \
+      PARTITION_NAME,                  \
+      NUM_PARTITIONS,                  \
+      NUM_PARTITIONS_NAME,             \
+      ENCODING_MODE,                   \
+      ENCODING_NAME,                   \
+      NullMode::kNoNulls,              \
+      no_null)                         \
+  REGISTER_PARTITION_PAIR(             \
+      T,                               \
+      TYPE_NAME,                       \
+      PARTITION_MODE,                  \
+      PARTITION_NAME,                  \
+      NUM_PARTITIONS,                  \
+      NUM_PARTITIONS_NAME,             \
+      ENCODING_MODE,                   \
+      ENCODING_NAME,                   \
+      NullMode::kHalfNulls,            \
+      half_null)                       \
+  REGISTER_PARTITION_PAIR(             \
+      T,                               \
+      TYPE_NAME,                       \
+      PARTITION_MODE,                  \
+      PARTITION_NAME,                  \
+      NUM_PARTITIONS,                  \
+      NUM_PARTITIONS_NAME,             \
+      ENCODING_MODE,                   \
+      ENCODING_NAME,                   \
+      NullMode::kAllNulls,             \
+      all_null)
+
+#define REGISTER_PARTITION_ENCODINGS( \
+    T,                                \
+    TYPE_NAME,                        \
+    PARTITION_MODE,                   \
+    PARTITION_NAME,                   \
+    NUM_PARTITIONS,                   \
+    NUM_PARTITIONS_NAME)              \
+  REGISTER_PARTITION_NULL_MODES(      \
+      T,                              \
+      TYPE_NAME,                      \
+      PARTITION_MODE,                 \
+      PARTITION_NAME,                 \
+      NUM_PARTITIONS,                 \
+      NUM_PARTITIONS_NAME,            \
+      EncodingMode::kFlat,            \
+      flat)                           \
+  REGISTER_PARTITION_NULL_MODES(      \
+      T,                              \
+      TYPE_NAME,                      \
+      PARTITION_MODE,                 \
+      PARTITION_NAME,                 \
+      NUM_PARTITIONS,                 \
+      NUM_PARTITIONS_NAME,            \
+      EncodingMode::kDictionary,      \
+      dictionary)                     \
+  REGISTER_PARTITION_NULL_MODES(      \
+      T,                              \
+      TYPE_NAME,                      \
+      PARTITION_MODE,                 \
+      PARTITION_NAME,                 \
+      NUM_PARTITIONS,                 \
+      NUM_PARTITIONS_NAME,            \
+      EncodingMode::kConstant,        \
+      constant)
+
+#define REGISTER_PARTITION_COUNTS(                                \
+    T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME)                 \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1, p1)        \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 4, p4)        \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 16, p16)      \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 100, p100)    \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1'000, p1000) \
+  REGISTER_PARTITION_ENCODINGS(                                   \
+      T, TYPE_NAME, PARTITION_MODE, PARTITION_NAME, 1'024, p1024)
+
+// Registers every partition benchmark for key type T. The two hash-bit-range
+// modes are registered once each with NUM_PARTITIONS 0: the
+// makePartitionFunction cases for those modes build the function from a
+// HashBitRange and do not use the partition count.
+#define REGISTER_PARTITION_MODES(T, TYPE_NAME) \
+  REGISTER_PARTITION_COUNTS(T, TYPE_NAME, PartitionMode::kRemote, remote) \
+  REGISTER_PARTITION_COUNTS( \
+      T, TYPE_NAME, PartitionMode::kLocalExchange, local_exchange) \
+  REGISTER_PARTITION_ENCODINGS( \
+      T, TYPE_NAME, \
+      PartitionMode::kHashBitRangeFirst8, hashbits_0_8, \
+      0, \
+      hashbits) \
+  REGISTER_PARTITION_ENCODINGS( \
+      T, TYPE_NAME, \
+      PartitionMode::kHashBitRangeLast8, hashbits_last_8, \
+      0, \
+      hashbits)
+
+REGISTER_PARTITION_MODES(bool, bool)
+REGISTER_PARTITION_MODES(int8_t, tinyint)
+REGISTER_PARTITION_MODES(int16_t, smallint)
+REGISTER_PARTITION_MODES(int32_t, integer)
+REGISTER_PARTITION_MODES(int64_t, bigint)
+REGISTER_PARTITION_MODES(StringView, varchar)
+
+#define REGISTER_RANGE_REDUCTION_PAIR(NUM_PARTITIONS, NUM_PARTITIONS_NAME) \
+  BENCHMARK(normal_range_reduction_##NUM_PARTITIONS_NAME, iterations) {    \
+    runRangeReductionBenchmark<FunctionKind::kNormal>(                     \
+        iterations, NUM_PARTITIONS);                                       \
+  }                                                                        \
+  BENCHMARK_RELATIVE(                                                      \
+      optimized_range_reduction_##NUM_PARTITIONS_NAME, iterations) {       \
+    runRangeReductionBenchmark<FunctionKind::kOptimized>(                  \
+        iterations, NUM_PARTITIONS);                                       \
+  }                                                                        \
+  BENCHMARK_DRAW_LINE();
+
+REGISTER_RANGE_REDUCTION_PAIR(1, p1)
+REGISTER_RANGE_REDUCTION_PAIR(4, p4)
+REGISTER_RANGE_REDUCTION_PAIR(16, p16)
+REGISTER_RANGE_REDUCTION_PAIR(100, p100)
+REGISTER_RANGE_REDUCTION_PAIR(1'000, p1000)
+REGISTER_RANGE_REDUCTION_PAIR(1'024, p1024)
+
+#undef REGISTER_PARTITION_MODES
+#undef REGISTER_PARTITION_COUNTS
+#undef REGISTER_PARTITION_ENCODINGS
+#undef REGISTER_PARTITION_NULL_MODES
+#undef REGISTER_PARTITION_PAIR
+#undef REGISTER_RANGE_REDUCTION_PAIR
+
+} // namespace
+
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv}; // Initializes folly from the command line.
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  folly::runBenchmarks(); // Runs the benchmarks registered in this file.
+  return 0;
+}
diff --git a/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
new file mode 100644
index 00000000000..32fdc278857
--- /dev/null
+++ b/velox/exec/benchmarks/OptimizedVectorHasherBenchmark.cpp
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <array>
+#include <numeric>
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/exec/OptimizedVectorHasher.h"
+#include "velox/exec/VectorHasher.h"
+#include "velox/type/HugeInt.h"
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/tests/utils/VectorMaker.h"
+
+// Add the following definitions to allow Clion runs.
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+enum class NullMode {
+  kNoNulls, // makeNulls() returns an empty predicate.
+  kHalfNulls, // Even-numbered rows are null.
+  kAllNulls, // Every row is null.
+};
+
+enum class EncodingMode {
+  kFlat, // The flat vector is used as is.
+  kDictionary, // Dictionary wrapping a flat base vector.
+  kConstant, // Constant wrapping row 0 of a flat vector, or a null constant.
+};
+
+template <typename T>
+T makeValue(vector_size_t row) { // Deterministic value derived from 'row'.
+  return static_cast<T>((row * 8191) ^ (row >> 3));
+}
+
+template <>
+bool makeValue<bool>(vector_size_t row) {
+  return (row & 1) == 0; // True on even rows.
+}
+
+template <>
+float makeValue<float>(vector_size_t row) {
+  return static_cast<float>(row) * 1.25f - 1000.0f; // Row 0 maps to -1000.
+}
+
+template <>
+double makeValue<double>(vector_size_t row) {
+  return static_cast<double>(row) * 1.25 - 1000.0; // Row 0 maps to -1000.
+}
+
+template <>
+int128_t makeValue<int128_t>(vector_size_t row) {
+  return HugeInt::build(
+      static_cast<int64_t>(row * 31),
+      static_cast<uint64_t>(row * 1315423911ULL + 17));
+}
+
+template <>
+StringView makeValue<StringView>(vector_size_t row) {
+  thread_local std::array<char, 20> buffer; // Sized for the longest string.
+  const auto length = 5 + row % 16; // 5..20 for non-negative rows.
+  for (vector_size_t i = 0; i < length; ++i) {
+    buffer[i] = 'a' + (row + i * 7) % 26;
+  }
+  return StringView(buffer.data(), length); // May alias 'buffer'.
+}
+
+std::function<bool(vector_size_t)> makeNulls(NullMode nullMode) {
+  switch (nullMode) {
+    case NullMode::kNoNulls:
+      return nullptr; // Empty std::function: no is-null predicate.
+    case NullMode::kHalfNulls:
+      return [](vector_size_t row) { return (row & 1) == 0; }; // Even rows.
+    case NullMode::kAllNulls:
+      return [](vector_size_t /*row*/) { return true; };
+  }
+
+  VELOX_UNREACHABLE();
+}
+
+// Builds a 'numValues'-row vector in the requested encoding. For kDictionary
+// the base holds 'dictionarySize' rows, indexed in reverse with wrap-around.
+template <typename T>
+VectorPtr makeValuesVector(
+    VectorMaker& vectorMaker,
+    memory::MemoryPool* pool,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    vector_size_t numValues,
+    vector_size_t dictionarySize) {
+  auto base = vectorMaker.flatVector<T>(
+      encodingMode == EncodingMode::kDictionary ? dictionarySize : numValues,
+      [](vector_size_t row) { return makeValue<T>(row); },
+      makeNulls(nullMode));
+  switch (encodingMode) {
+    case EncodingMode::kFlat:
+      return base;
+    case EncodingMode::kDictionary: {
+      auto indices = AlignedBuffer::allocate<vector_size_t>(numValues, pool);
+      auto* rawIndices = indices->asMutable<vector_size_t>();
+      for (vector_size_t row = 0; row < numValues; ++row) {
+        rawIndices[row] = (numValues - row - 1) % dictionarySize;
+      }
+      return BaseVector::wrapInDictionary(
+          BufferPtr(nullptr), indices, numValues, base);
+    }
+    case EncodingMode::kConstant:
+      if (nullMode == NullMode::kAllNulls) {
+        return BaseVector::createNullConstant(
+            CppToType<T>::create(), numValues, pool);
+      }
+      return BaseVector::wrapInConstant(numValues, 0, base);
+  }
+  VELOX_UNREACHABLE();
+}
+
+template <typename Hasher>
+struct HasherRunner; // Specialized below with a create() per hasher class.
+
+template <>
+struct HasherRunner<VectorHasher> {
+  static std::unique_ptr<VectorHasher> create(const TypePtr& type) {
+    return VectorHasher::create(type, 0); // 0: presumably the key channel.
+  }
+};
+
+template <>
+struct HasherRunner<OptimizedVectorHasher> {
+  static std::unique_ptr<OptimizedVectorHasher> create(const TypePtr& type) {
+    return OptimizedVectorHasher::create(type, 0); // Same arguments as above.
+  }
+};
+
+// Times 'iterations' calls to Hasher::hash() over all 'size' rows of a vector
+// of type T. The vector is built and decoded while timing is suspended. When
+// 'mix' is set, 'hashes' is pre-seeded with 0, 1, 2, ... before timing.
+template <typename T, typename Hasher>
+void runHashBenchmark(
+    uint32_t iterations,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    bool mix,
+    vector_size_t size,
+    vector_size_t dictionarySize) {
+  folly::BenchmarkSuspender suspender;
+  auto pool = memory::memoryManager()->addLeafPool();
+  VectorMaker vectorMaker(pool.get());
+  auto input = makeValuesVector<T>(
+      vectorMaker, pool.get(), nullMode, encodingMode, size, dictionarySize);
+  auto hasher = HasherRunner<Hasher>::create(CppToType<T>::create());
+  raw_vector<uint64_t> hashes(size, pool.get());
+  if (mix) {
+    std::iota(hashes.begin(), hashes.end(), 0);
+  }
+  SelectivityVector allRows(size);
+  hasher->decode(*input, allRows);
+  suspender.dismiss();
+
+  for (uint32_t remaining = iterations; remaining > 0; --remaining) {
+    hasher->hash(allRows, mix, hashes);
+    folly::doNotOptimizeAway(hashes.data());
+  }
+}
+
+template <typename T> // Baseline entry point used by REGISTER_HASHER_PAIR.
+void benchmarkVectorHasher(
+    uint32_t iterations,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    bool mix,
+    vector_size_t size,
+    vector_size_t dictionarySize) {
+  runHashBenchmark<T, VectorHasher>(
+      iterations, nullMode, encodingMode, mix, size, dictionarySize);
+}
+
+template <typename T> // Relative entry point used by REGISTER_HASHER_PAIR.
+void benchmarkOptimizedVectorHasher(
+    uint32_t iterations,
+    NullMode nullMode,
+    EncodingMode encodingMode,
+    bool mix,
+    vector_size_t size,
+    vector_size_t dictionarySize) {
+  runHashBenchmark<T, OptimizedVectorHasher>(
+      iterations, nullMode, encodingMode, mix, size, dictionarySize);
+}
+
+// Registers a baseline benchmark that runs VectorHasher and a relative
+// benchmark that runs OptimizedVectorHasher with the same arguments, then
+// draws a separator line. T is the element type. TYPE_NAME, NULL_NAME,
+// ENCODING_NAME and MIX_NAME are only pasted into the benchmark identifiers;
+// the remaining arguments are forwarded to the benchmark functions.
+#define REGISTER_HASHER_PAIR( \
+    T, TYPE_NAME, \
+    NULL_MODE, NULL_NAME, \
+    ENCODING_MODE, ENCODING_NAME, \
+    MIX, MIX_NAME, \
+    SIZE, DICTIONARY_SIZE) \
+  BENCHMARK(TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME, n) { \
+    benchmarkVectorHasher<T>( \
+        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE); \
+  } \
+  BENCHMARK_RELATIVE( \
+      optimized_##TYPE_NAME##_##ENCODING_NAME##_##NULL_NAME##_##MIX_NAME, n) { \
+    benchmarkOptimizedVectorHasher<T>( \
+        n, NULL_MODE, ENCODING_MODE, MIX, SIZE, DICTIONARY_SIZE); \
+  } \
+  BENCHMARK_DRAW_LINE();
+
+#define REGISTER_HASHER_NULL_MODES( \
+    T,                              \
+    TYPE_NAME,                      \
+    ENCODING_MODE,                  \
+    ENCODING_NAME,                  \
+    MIX,                            \
+    MIX_NAME,                       \
+    SIZE,                           \
+    DICTIONARY_SIZE)                \
+  REGISTER_HASHER_PAIR(             \
+      T,                            \
+      TYPE_NAME,                    \
+      NullMode::kNoNulls,           \
+      no_null,                      \
+      ENCODING_MODE,                \
+      ENCODING_NAME,                \
+      MIX,                          \
+      MIX_NAME,                     \
+      SIZE,                         \
+      DICTIONARY_SIZE)              \
+  REGISTER_HASHER_PAIR(             \
+      T,                            \
+      TYPE_NAME,                    \
+      NullMode::kHalfNulls,         \
+      half_null,                    \
+      ENCODING_MODE,                \
+      ENCODING_NAME,                \
+      MIX,                          \
+      MIX_NAME,                     \
+      SIZE,                         \
+      DICTIONARY_SIZE)              \
+  REGISTER_HASHER_PAIR(             \
+      T,                            \
+      TYPE_NAME,                    \
+      NullMode::kAllNulls,          \
+      all_null,                     \
+      ENCODING_MODE,                \
+      ENCODING_NAME,                \
+      MIX,                          \
+      MIX_NAME,                     \
+      SIZE,                         \
+      DICTIONARY_SIZE)
+
+#define REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, SIZE) \
+  REGISTER_HASHER_PAIR(                                                        \
+      T,                                                                       \
+      TYPE_NAME,                                                               \
+      NullMode::kNoNulls,                                                      \
+      no_null,                                                                 \
+      EncodingMode::kConstant,                                                 \
+      constant,                                                                \
+      MIX,                                                                     \
+      MIX_NAME,                                                                \
+      SIZE,                                                                    \
+      SIZE)                                                                    \
+  REGISTER_HASHER_PAIR(                                                        \
+      T,                                                                       \
+      TYPE_NAME,                                                               \
+      NullMode::kAllNulls,                                                     \
+      all_null,                                                                \
+      EncodingMode::kConstant,                                                 \
+      constant,                                                                \
+      MIX,                                                                     \
+      MIX_NAME,                                                                \
+      SIZE,                                                                    \
+      SIZE)
+
+#define REGISTER_HASHER_SIZES(                                 \
+    T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_NULL_MODES(                                  \
+      T, TYPE_NAME, ENCODING_MODE, ENCODING_NAME, MIX, MIX_NAME, 10000, 10000)
+
+#define REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_NULL_MODES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME, 10000)
+
+// Registers dictionary-encoded benchmarks (all null modes) whose dictionary
+// base holds PERCENT percent of SIZE rows. SIZE and PERCENT are parenthesized
+// in the computation so expression arguments keep their grouping.
+#define REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(         \
+    T, TYPE_NAME, MIX, MIX_NAME, SIZE, PERCENT, PERCENT_NAME) \
+  REGISTER_HASHER_NULL_MODES(                                 \
+      T, TYPE_NAME,                                           \
+      EncodingMode::kDictionary, dictionary_##PERCENT_NAME,   \
+      MIX, MIX_NAME,                                          \
+      SIZE,                                                   \
+      ((SIZE) * (PERCENT) / 100))
+
+#define REGISTER_HASHER_SIZES_DICTIONARY(T, TYPE_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 80, 80pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 60, 60pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 40, 40pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 20, 20pct)                  \
+  REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT(                       \
+      T, TYPE_NAME, MIX, MIX_NAME, 10000, 5, 5pct)
+
+#define REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, MIX, MIX_NAME)  \
+  REGISTER_HASHER_SIZES(                                        \
+      T, TYPE_NAME, EncodingMode::kFlat, flat, MIX, MIX_NAME)   \
+  REGISTER_HASHER_SIZES_DICTIONARY(T, TYPE_NAME, MIX, MIX_NAME) \
+  REGISTER_HASHER_SIZES_CONSTANT(T, TYPE_NAME, MIX, MIX_NAME)
+
+#define REGISTER_HASHER_TYPE(T, TYPE_NAME)               \
+  REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, false, no_mix) \
+  REGISTER_HASHER_ENCODINGS(T, TYPE_NAME, true, mix)
+
+REGISTER_HASHER_TYPE(bool, boolean)
+REGISTER_HASHER_TYPE(int8_t, tinyint)
+REGISTER_HASHER_TYPE(int16_t, smallint)
+REGISTER_HASHER_TYPE(int32_t, integer)
+REGISTER_HASHER_TYPE(int64_t, bigint)
+REGISTER_HASHER_TYPE(int128_t, hugeint)
+REGISTER_HASHER_TYPE(float, real)
+REGISTER_HASHER_TYPE(double, double)
+REGISTER_HASHER_TYPE(StringView, varchar)
+
+#undef REGISTER_HASHER_TYPE
+#undef REGISTER_HASHER_SIZES_DICTIONARY
+#undef REGISTER_HASHER_SIZES_DICTIONARY_FOR_PERCENT
+#undef REGISTER_HASHER_SIZES
+#undef REGISTER_HASHER_NULL_MODES
+#undef REGISTER_HASHER_PAIR
+
+} // namespace
+
+int main(int argc, char** argv) {
+  folly::Init init{&argc, &argv}; // Initializes folly from the command line.
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  folly::runBenchmarks(); // Runs the benchmarks registered in this file.
+  return 0;
+}
diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt
index a97d63ccd5b..189e7fc8680 100644
--- a/velox/exec/tests/CMakeLists.txt
+++ b/velox/exec/tests/CMakeLists.txt
@@ -65,6 +65,7 @@ set(
   EnforceDistinctTest.cpp
   TraceUtilTest.cpp
   HashPartitionFunctionTest.cpp
+  OptimizedHashPartitionFunctionTest.cpp
   SpatialIndexTest.cpp
   ValuesTest.cpp
   ParallelProjectTest.cpp
@@ -148,6 +149,8 @@ set(
   AssignUniqueIdTest.cpp
   FilterProjectTest.cpp
   AsyncConnectorTest.cpp
+  OptimizedPartitionedOutputTest.cpp
+  OptimizedVectorHasherTest.cpp
 )
 
 set(
diff --git a/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp b/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp
new file mode 100644
index 00000000000..b9d6b193159
--- /dev/null
+++ b/velox/exec/tests/OptimizedHashPartitionFunctionTest.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/exec/OptimizedHashPartitionFunction.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+
+class OptimizedHashPartitionFunctionTest : public velox::test::VectorTestBase,
+                                           public testing::Test {
+ protected:
+  static void SetUpTestCase() { // gtest per-suite setup hook.
+    memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{});
+  }
+};
+
+TEST_F(
+    OptimizedHashPartitionFunctionTest,
+    powerOfTwoRangeReductionMatchesMultiplyHigh) {
+  const std::vector<uint64_t> hashes = {
+      0,
+      1,
+      0x0000'0001'0000'0000ULL,
+      0x1234'5678'9abc'def0ULL,
+      0xffff'ffff'ffff'ffffULL,
+  };
+  // Inputs: zero, low half only, high half only, mixed, and all bits set.
+  for (const auto numPartitions : {1, 2, 4, 1'024}) {
+    std::vector<uint32_t> partitions(hashes.size());
+    rangeReduction(
+        hashes.data(),
+        partitions.data(),
+        static_cast<vector_size_t>(hashes.size()),
+        numPartitions);
+    // Reference: XOR-fold to 32 bits, multiply, keep the upper 32 bits.
+    std::vector<uint32_t> expected;
+    expected.reserve(hashes.size());
+    for (const auto hash : hashes) {
+      const auto mixedHash =
+          static_cast<uint32_t>(hash) ^ static_cast<uint32_t>(hash >> 32);
+      expected.push_back(
+          (static_cast<uint64_t>(mixedHash) * numPartitions) >> 32);
+    }
+
+    EXPECT_EQ(partitions, expected);
+  }
+}
+
+TEST_F(
+    OptimizedHashPartitionFunctionTest,
+    optimizedHashBitRangeMatchesRegular) {
+  const auto numRows = 10'000;
+  auto input = makeRowVector(
+      {makeNullableFlatVector<int64_t>([&] {
+         std::vector<std::optional<int64_t>> values;
+         values.reserve(numRows);
+         for (auto row = 0; row < numRows; ++row) {
+           values.emplace_back(
+               row % 17 == 0 ? std::nullopt : std::optional<int64_t>(row * 13));
+         }
+         return values;
+       }()),
+       makeFlatVector<StringView>(numRows, [](auto row) {
+         return StringView::makeInline(fmt::format("value_{}", row % 97));
+       })});
+  const auto rowType = asRowType(input->type());
+  // Build both implementations with the same bit range and key columns.
+  HashPartitionFunction regular(HashBitRange{0, 5}, rowType, {0, 1});
+  OptimizedHashPartitionFunction optimized(HashBitRange{0, 5}, rowType, {0, 1});
+  // The optional return value and the per-row partitions must both agree.
+  std::vector<uint32_t> regularPartitions;
+  std::vector<uint32_t> optimizedPartitions;
+  EXPECT_EQ(
+      regular.partition(*input, regularPartitions),
+      optimized.partition(*input, optimizedPartitions));
+  EXPECT_EQ(regularPartitions, optimizedPartitions);
+}
+
+TEST_F(OptimizedHashPartitionFunctionTest, onePartitionReturnsConstantResult) {
+  auto input = makeRowVector({makeConstant(true, 10'000)});
+  const auto rowType = asRowType(input->type());
+  OptimizedHashPartitionFunction partitionFunction(
+      /*localExchange=*/true, 1, rowType, {0});
+  // Seed the output so the test can tell that partition() left it untouched.
+  std::vector<uint32_t> partitions{123};
+  EXPECT_EQ(partitionFunction.partition(*input, partitions), 0u);
+  EXPECT_EQ(partitions, std::vector<uint32_t>{123});
+}
+
+TEST_F(OptimizedHashPartitionFunctionTest, emptyConstantKeyReturnsEmptyResult) {
+  auto input = makeRowVector({makeConstant(true, 0)});
+  const auto rowType = asRowType(input->type());
+  OptimizedHashPartitionFunction optimized(
+      /*localExchange=*/true, 16, rowType, {0});
+  // Seed the output so the test can tell that partition() emptied it.
+  std::vector<uint32_t> optimizedPartitions{123};
+  EXPECT_EQ(optimized.partition(*input, optimizedPartitions), std::nullopt);
+  EXPECT_TRUE(optimizedPartitions.empty());
+}
+
+TEST_F(OptimizedHashPartitionFunctionTest, specUsesConfiguredImplementation) {
+  auto input = makeRowVector(
+      {makeFlatVector<int32_t>({1, 2, 3, 4}),
+       makeFlatVector<StringView>({"a", "b", "c", "d"})});
+  const auto rowType = asRowType(input->type());
+  HashPartitionFunctionSpec spec(rowType, std::vector<column_index_t>{0, 1});
+  auto optimizedFunction = spec.create(8, /*localExchange=*/false, true);
+  ASSERT_NE(
+      dynamic_cast<OptimizedHashPartitionFunction*>(optimizedFunction.get()),
+      nullptr);
+  // Omitting the third argument must still produce a HashPartitionFunction.
+  auto regularFunction = spec.create(8, /*localExchange=*/false);
+  ASSERT_NE(
+      dynamic_cast<HashPartitionFunction*>(regularFunction.get()), nullptr);
+  // Every row must be assigned one of the 8 requested partitions.
+  std::vector<uint32_t> optimizedPartitions;
+  ASSERT_EQ(
+      optimizedFunction->partition(*input, optimizedPartitions), std::nullopt);
+  ASSERT_EQ(optimizedPartitions.size(), input->size());
+  for (const auto partition : optimizedPartitions) {
+    EXPECT_LT(partition, 8);
+  }
+}
diff --git a/velox/exec/tests/OptimizedPartitionedOutputTest.cpp b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
new file mode 100644
index 00000000000..ed9fa875624
--- /dev/null
+++ b/velox/exec/tests/OptimizedPartitionedOutputTest.cpp
@@ -0,0 +1,1036 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <future>
+#include <random>
+#include <string_view>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "velox/common/base/BitUtil.h"
+#include "velox/common/memory/ByteStream.h"
+#include "velox/exec/HashPartitionFunction.h"
+#include "velox/exec/OptimizedPartitionedOutput.h"
+#include "velox/exec/Task.h"
+#include "velox/exec/tests/utils/OperatorTestBase.h"
+#include "velox/exec/tests/utils/PlanBuilder.h"
+#include "velox/exec/tests/utils/QueryAssertions.h"
+#include "velox/serializers/PrestoSerializer.h"
+#include "velox/serializers/PrestoSerializerSerializationUtils.h"
+
+namespace facebook::velox::exec::test {
+
+namespace {
+
+int64_t simpleColumnPageBytes(
+    std::string_view encodingName,
+    int64_t numRows,
+    int64_t numNulls,
+    int64_t valueWidth) {
+  // Page header, numColumns (4), encoding header (4 + name), rowCount (4),
+  // and the one-byte null flag.
+  const int64_t fixedBytes = serializer::presto::detail::kHeaderSize + 4 + 4 +
+      static_cast<int64_t>(encodingName.size()) + 4 + 1;
+  // The null bitmap is only counted when at least one row is null.
+  const int64_t nullBytes = numNulls > 0 ? bits::nbytes(numRows) : 0;
+  return fixedBytes + nullBytes + (numRows - numNulls) * valueWidth;
+}
+
+} // namespace
+
+/// How null values are distributed in value columns.
+enum class NullMode {
+  kNoNull, // No row is null.
+  kPartialNull, // Even-indexed rows (i % 2 == 0) are null.
+  kAllNull, // Every row is null.
+};
+
+/// Describes one parameterized test configuration.
+struct TestParam {
+  /// Name returned to gtest by the instantiation's name generator.
+  std::string name;
+  /// Element type of every value column. Unused when numValueCols == 0.
+  TypePtr valueType;
+  /// Number of partition-key columns (all INTEGER), named p1..pN.
+  int numPartitionCols;
+  /// Number of value columns of valueType, named v0..v{N-1}.
+  int numValueCols;
+  /// Null pattern applied to value columns.
+  NullMode nullMode;
+};
+
+/// Builds every TestParam combination exercised by the parameterized suite:
+///   - numValueCols==0: 1 entry per numPartitionCols (type/nullMode irrelevant)
+///   - numValueCols∈{1,256}: all 4 types × 2 pk counts × 3 null modes
+std::vector<TestParam> testParams() {
+  const std::vector<std::pair<std::string, TypePtr>> valueTypes = {
+      {"bool", BOOLEAN()},
+      {"tinyint", TINYINT()},
+      {"bigint", BIGINT()},
+      {"hugeint", HUGEINT()},
+  };
+
+  const std::vector<std::pair<std::string, NullMode>> nullPatterns = {
+      {"no_null", NullMode::kNoNull},
+      {"partial_null", NullMode::kPartialNull},
+      {"all_null", NullMode::kAllNull},
+  };
+
+  std::vector<TestParam> result;
+  // Without value columns the value type and null mode are never consulted,
+  // so one placeholder entry per pk count is enough.
+  for (const int pkCount : {1, 4}) {
+    result.push_back({
+        .name = "pk" + std::to_string(pkCount) + "_val0",
+        .valueType = BIGINT(),
+        .numPartitionCols = pkCount,
+        .numValueCols = 0,
+        .nullMode = NullMode::kNoNull,
+    });
+  }
+
+  // One and many value columns: all type × pk-count × null-mode combinations.
+  for (const int valueCount : {1, 256}) {
+    const auto valueTag = "_val" + std::to_string(valueCount) + "_";
+    for (const auto& [typeLabel, valueType] : valueTypes) {
+      for (const int pkCount : {1, 4}) {
+        const auto prefix = "pk" + std::to_string(pkCount) + valueTag;
+        for (const auto& [nullLabel, nullPattern] : nullPatterns) {
+          result.push_back({
+              .name = prefix + typeLabel + "_" + nullLabel,
+              .valueType = valueType,
+              .numPartitionCols = pkCount,
+              .numValueCols = valueCount,
+              .nullMode = nullPattern,
+          });
+        }
+      }
+    }
+  }
+  return result;
+}
+
+/// Collected output from a single run of runPartitionedOutput().
+struct PartitionedOutputResult {
+  // Declared first so it is destroyed last (reverse declaration order): the
+  // pages' IOBufs reference the task's memory pool and must not outlive it.
+  std::shared_ptr<Task> task;
+  /// Serialized output pages per partition, indexed by partition ID.
+  std::vector<std::vector<std::unique_ptr<folly::IOBuf>>> pages;
+  /// Number of pages received by each partition (pages[p].size()).
+  std::vector<size_t> pageCounts;
+  /// Total rows deserialized from each partition's pages.
+  std::vector<int64_t> rowCounts;
+  /// Number of partitions that received at least one page.
+  int numNonEmptyPartitions{0};
+  /// Sum of the "numAppends" runtime stat of pipeline 0's last operator.
+  int64_t numAppends{0};
+  /// Sum of the "numFlushes" runtime stat of pipeline 0's last operator.
+  int64_t numFlushes{0};
+  /// Sum of the "numBlockedTimes" runtime stat of pipeline 0's last operator.
+  int64_t numBlockedTimes{0};
+};
+
+/// Shared infrastructure for all OptimizedPartitionedOutput tests.
+class OptimizedPartitionedOutputTest : public OperatorTestBase {
+ protected:
+  void SetUp() override {
+    OperatorTestBase::SetUp(); // Base fixture first, then the listener factory.
+    bufferManager_->setListenerFactory([]() {
+      return std::make_unique<serializer::presto::PrestoOutputStreamListener>();
+    });
+  }
+
+  std::shared_ptr<core::QueryCtx> createQueryContext(
+      std::unordered_map<std::string, std::string> config) {
+    config[core::QueryConfig::kOptimizedPartitionedOutputEnabled] = "true";
+    core::QueryConfig queryConfig(std::move(config)); // Enable flag included.
+    return core::QueryCtx::create(executor_.get(), std::move(queryConfig));
+  }
+
+  /// Requests pages for the given destination and sequence from the output
+  /// buffer manager. Waits up to 10 seconds; VELOX_CHECK fails on timeout.
+  std::vector<std::unique_ptr<folly::IOBuf>>
+  getData(const std::string& taskId, int destination, int64_t sequence) {
+    auto [promise, semiFuture] = folly::makePromiseContract<
+        std::vector<std::unique_ptr<folly::IOBuf>>>();
+    VELOX_CHECK(bufferManager_->getData(
+        taskId,
+        destination,
+        OptimizedPartitionedOutput::kMinDestinationSize,
+        sequence,
+        [result = std::make_shared<
+             folly::Promise<std::vector<std::unique_ptr<folly::IOBuf>>>>(
+             std::move(promise))]( // Presumably so the callback is copyable.
+            std::vector<std::unique_ptr<folly::IOBuf>> pages,
+            int64_t /*sequence*/,
+            std::vector<int64_t> /*remainingBytes*/) {
+          result->setValue(std::move(pages));
+        }));
+    auto future = std::move(semiFuture).via(executor_.get());
+    future.wait(std::chrono::seconds{10}); // Bounded, so a stall fails fast.
+    VELOX_CHECK(future.isReady());
+    return std::move(future).value();
+  }
+
+  /// Drains one destination: keeps fetching pages until a null page (the end
+  /// marker) arrives, then deletes the destination's results and returns the
+  /// non-null pages in arrival order.
+  std::vector<std::unique_ptr<folly::IOBuf>> getAllData(
+      const std::string& taskId,
+      int destination) {
+    std::vector<std::unique_ptr<folly::IOBuf>> drained;
+    // The attempt cap bounds the loop instead of letting it spin forever.
+    for (int attempt = 1;; ++attempt) {
+      VELOX_CHECK_LT(attempt, 10'000);
+      // The count of pages received so far is passed as the next sequence.
+      auto batch = getData(taskId, destination, drained.size());
+      for (auto& page : batch) {
+        if (page == nullptr) {
+          // End marker: release the destination and stop.
+          bufferManager_->deleteResults(taskId, destination);
+          return drained;
+        }
+        drained.push_back(std::move(page));
+      }
+    }
+  }
+
+  /// Decodes one Presto-serialized IOBuf page into a RowVector of rowType.
+  RowVectorPtr deserializePage(
+      const folly::IOBuf* iobuf,
+      const RowTypePtr& rowType) {
+    // byteRangesFromIOBuf() takes a mutable pointer, hence the const_cast.
+    BufferInputStream pageStream(
+        byteRangesFromIOBuf(const_cast<folly::IOBuf*>(iobuf)));
+    RowVectorPtr page;
+    serializer::presto::PrestoVectorSerde prestoSerde;
+    prestoSerde.deserialize(&pageStream, pool(), rowType, &page, 0, nullptr);
+    return page;
+  }
+
+  /// Decodes every page of one partition and appends them, in order, into a
+  /// single RowVector. With no pages, returns an empty RowVector of rowType.
+  RowVectorPtr concatPages(
+      const std::vector<std::unique_ptr<folly::IOBuf>>& pages,
+      const RowTypePtr& rowType) {
+    RowVectorPtr merged;
+    for (const auto& buffer : pages) {
+      auto decoded = deserializePage(buffer.get(), rowType);
+      if (merged) {
+        merged->append(decoded.get());
+      } else {
+        merged = std::move(decoded);
+      }
+    }
+    if (merged) {
+      return merged;
+    }
+    return std::static_pointer_cast<RowVector>(
+        BaseVector::create(rowType, 0, pool()));
+  }
+
+  RowTypePtr outputTypeForLayout(
+      const RowTypePtr& inputType,
+      const std::vector<std::string>& outputLayout) {
+    // An empty layout keeps the input schema unchanged.
+    if (outputLayout.empty()) {
+      return inputType;
+    }
+    std::vector<TypePtr> columnTypes(outputLayout.size());
+    for (size_t i = 0; i < outputLayout.size(); ++i) {
+      // Each output column takes the type of the same-named input column.
+      columnTypes[i] = inputType->findChild(outputLayout[i]);
+    }
+    return ROW(outputLayout, std::move(columnTypes));
+  }
+
+  RowVectorPtr buildOutput(
+      const RowVectorPtr& input,
+      const std::vector<std::string>& outputLayout) {
+    const auto inputType = asRowType(input->type());
+    const auto outputType = outputTypeForLayout(inputType, outputLayout);
+    const auto& names = // Empty layout selects every input column.
+        outputLayout.empty() ? inputType->names() : outputLayout;
+    std::vector<VectorPtr> columns;
+    for (const auto& name : names) {
+      columns.push_back(input->childAt(inputType->getChildIdx(name)));
+    }
+    return std::make_shared<RowVector>(
+        input->pool(), outputType, nullptr, input->size(), std::move(columns));
+  }
+
+  /// Produces an order-independent view of a vector: a dictionary over the
+  /// input whose indices list the rows in ascending value order.
+  VectorPtr canonicalize(const VectorPtr& vector) {
+    const auto size = vector->size();
+    auto sortedIndices = makeIndices(size, [](auto row) { return row; });
+    auto* rawIndices = sortedIndices->asMutable<vector_size_t>();
+    std::stable_sort(rawIndices, rawIndices + size, [&](auto lhs, auto rhs) {
+      return vector->compare(vector.get(), lhs, rhs) < 0;
+    });
+    return BaseVector::wrapInDictionary(nullptr, sortedIndices, size, vector);
+  }
+
+  /// Builds a RowVector of rowType by copying, in rowList order, the row at
+  /// each (batchIdx, rowIdx) position of batches. Used to construct the
+  /// per-partition expected RowVector.
+  RowVectorPtr gatherRows(
+      const std::vector<RowVectorPtr>& batches,
+      const std::vector<std::pair<int, int>>& rowList,
+      const RowTypePtr& rowType) {
+    auto gathered = std::static_pointer_cast<RowVector>(BaseVector::create(
+        rowType, static_cast<vector_size_t>(rowList.size()), pool()));
+    vector_size_t targetRow = 0;
+    for (const auto& [batchIdx, rowIdx] : rowList) {
+      gathered->copy(batches[batchIdx].get(), targetRow++, rowIdx, 1);
+    }
+    return gathered;
+  }
+
+  int64_t getIntRuntimeStat(Task* task, const std::string& statName) {
+    const auto stats = task->taskStats();
+    // Reads the last operator of pipeline 0; a missing stat counts as 0.
+    const auto& lastOp = stats.pipelineStats[0].operatorStats.back();
+    const auto found = lastOp.runtimeStats.find(statName);
+    return found == lastOp.runtimeStats.end() ? 0 : found->second.sum;
+  }
+
+  /// Convenience overload of runPartitionedOutputWithLayout() that passes an
+  /// empty output layout, i.e. the output keeps every input column. Returns the
+  /// collected pages, per-partition row counts, and operator runtime stats.
+  /// extraConfig is applied first; the OptimizedPartitionedOutput enable flag
+  /// is then forced to "true". timeout bounds the wait for task completion.
+  PartitionedOutputResult runPartitionedOutput(
+      const std::string& taskId,
+      const std::vector<RowVectorPtr>& inputBatches,
+      const std::vector<std::string>& partitionKeys,
+      int numPartitions,
+      std::unordered_map<std::string, std::string> extraConfig = {},
+      std::chrono::seconds timeout = std::chrono::seconds{30}) {
+    return runPartitionedOutputWithLayout(
+        taskId,
+        inputBatches,
+        partitionKeys,
+        numPartitions,
+        {},
+        std::move(extraConfig),
+        timeout);
+  }
+
+  PartitionedOutputResult runPartitionedOutputWithLayout(
+      const std::string& taskId,
+      const std::vector<RowVectorPtr>& inputBatches,
+      const std::vector<std::string>& partitionKeys,
+      int numPartitions,
+      const std::vector<std::string>& outputLayout,
+      std::unordered_map<std::string, std::string> extraConfig = {},
+      std::chrono::seconds timeout = std::chrono::seconds{30}) {
+    VELOX_CHECK(!inputBatches.empty()); // The first batch defines the schema.
+    const auto rowType =
+        std::dynamic_pointer_cast<const RowType>(inputBatches[0]->type());
+    const auto outputType = outputTypeForLayout(rowType, outputLayout);
+
+    auto plan =
+        PlanBuilder()
+            .values(inputBatches)
+            .partitionedOutput(partitionKeys, numPartitions, outputLayout)
+            .planNode();
+
+    auto task = Task::create(
+        taskId,
+        core::PlanFragment{plan},
+        0,
+        createQueryContext(std::move(extraConfig)),
+        Task::ExecutionMode::kParallel);
+    task->start(1);
+
+    // Drain all partitions concurrently, one consumer thread per destination,
+    // to avoid deadlock with the driver.
+    std::vector<std::future<std::vector<std::unique_ptr<folly::IOBuf>>>>
+        futures;
+    futures.reserve(numPartitions);
+    for (int p = 0; p < numPartitions; ++p) {
+      futures.push_back(std::async(std::launch::async, [&, p] {
+        return getAllData(taskId, p);
+      }));
+    }
+
+    const auto taskWaitUs =
+        std::chrono::duration_cast<std::chrono::microseconds>(timeout).count();
+    EXPECT_TRUE(waitForTaskCompletion(task.get(), taskWaitUs));
+
+    PartitionedOutputResult result;
+    result.pages.resize(numPartitions);
+    result.pageCounts.resize(numPartitions, 0);
+    result.rowCounts.resize(numPartitions, 0);
+
+    for (int p = 0; p < numPartitions; ++p) {
+      result.pages[p] = futures[p].get(); // Blocks until p is fully drained.
+      result.pageCounts[p] = result.pages[p].size();
+      if (result.pageCounts[p] > 0) {
+        ++result.numNonEmptyPartitions;
+      }
+      result.rowCounts[p] = concatPages(result.pages[p], outputType)->size();
+    }
+
+    result.numAppends = getIntRuntimeStat(task.get(), "numAppends");
+    result.numFlushes = getIntRuntimeStat(task.get(), "numFlushes");
+    result.numBlockedTimes = getIntRuntimeStat(task.get(), "numBlockedTimes");
+    result.task = task; // Keeps the task alive alongside the pages.
+
+    return result;
+  }
+
+ private:
+  const std::shared_ptr<OutputBufferManager> bufferManager_{
+      OutputBufferManager::getInstanceRef()};
+};
+
+// ─── Parameterized fixture ───────────────────────────────────────────────────
+
+/// Parameterized fixture that exercises every TestParam combination.
+class OptimizedPartitionedOutputParamTest
+    : public OptimizedPartitionedOutputTest,
+      public ::testing::WithParamInterface<TestParam> {
+ protected:
+  const TestParam& param() const {
+    return GetParam(); // Shorthand for the gtest-supplied TestParam.
+  }
+
+  /// Partition-key column names, numbered from 1: "p1", ..., "p{N}".
+  std::vector<std::string> pkColNames() const {
+    std::vector<std::string> result;
+    for (int ordinal = 1; ordinal <= param().numPartitionCols; ++ordinal) {
+      result.emplace_back("p" + std::to_string(ordinal));
+    }
+    return result;
+  }
+
+  /// Value column names, numbered from 0: "v0", ..., "v{N-1}".
+  std::vector<std::string> valueColNames() const {
+    std::vector<std::string> result;
+    for (int index = 0; index != param().numValueCols; ++index) {
+      result.emplace_back("v" + std::to_string(index));
+    }
+    return result;
+  }
+
+  /// Schema of every input batch: the INTEGER pk columns first, then the
+  /// value columns, all of the parameterized value type.
+  RowTypePtr inputType() const {
+    auto columnNames = pkColNames();
+    std::vector<TypePtr> columnTypes(param().numPartitionCols, INTEGER());
+    const auto valueNames = valueColNames();
+    columnNames.insert(columnNames.end(), valueNames.begin(), valueNames.end());
+    columnTypes.insert(columnTypes.end(), valueNames.size(), param().valueType);
+    return ROW(std::move(columnNames), std::move(columnTypes));
+  }
+
+  /// Pk columns come first in the input type, so their channels are 0..N-1.
+  std::vector<column_index_t> pkChannels() const {
+    std::vector<column_index_t> result(param().numPartitionCols);
+    std::iota(result.begin(), result.end(), column_index_t{0});
+    return result;
+  }
+
+  /// Whether value-column row rowIdx is null under the current null mode.
+  bool isNull(int rowIdx) const {
+    if (param().nullMode == NullMode::kNoNull) {
+      return false;
+    }
+    if (param().nullMode == NullMode::kAllNull) {
+      return true;
+    }
+    if (param().nullMode == NullMode::kPartialNull) {
+      return rowIdx % 2 == 0; // Even rows are null.
+    }
+    VELOX_UNREACHABLE();
+  }
+
+  /// Creates a flat vector of the param's value type (BOOLEAN, TINYINT, BIGINT,
+  /// or HUGEINT) filled with draws from rng; nulls are placed by isNull().
+  VectorPtr makeRandomValueVector(int numRows, std::mt19937_64& rng) {
+    auto isNullFn = [this](vector_size_t i) -> bool { return isNull(i); };
+
+    switch (param().valueType->kind()) {
+      case TypeKind::BOOLEAN:
+        return vectorMaker_.flatVector<bool>(
+            numRows,
+            [&](auto /*i*/) -> bool { return rng() % 2 == 0; },
+            isNullFn);
+      case TypeKind::TINYINT:
+        return vectorMaker_.flatVector<int8_t>(
+            numRows,
+            [&](auto /*i*/) -> int8_t { return static_cast<int8_t>(rng()); },
+            isNullFn);
+      case TypeKind::BIGINT:
+        return vectorMaker_.flatVector<int64_t>(
+            numRows,
+            [&](auto /*i*/) -> int64_t { return static_cast<int64_t>(rng()); },
+            isNullFn);
+      case TypeKind::HUGEINT:
+        return vectorMaker_.flatVector<int128_t>(
+            numRows,
+            [&](auto /*i*/) -> int128_t {
+              int64_t hi = static_cast<int64_t>(rng()); // High 64 bits.
+              uint64_t lo = rng(); // Low 64 bits.
+              return (static_cast<int128_t>(hi) << 64) |
+                  static_cast<int128_t>(lo);
+            },
+            isNullFn);
+      default:
+        VELOX_UNREACHABLE(
+            "Unsupported value type: {}", param().valueType->toString());
+    }
+  }
+
+  /// Builds one input RowVector. Pk column k (0-based) holds p0Values[row] + k,
+  /// so p0Values is the first pk column verbatim. Every value column is filled
+  /// with its own random data drawn from rng.
+  RowVectorPtr makeInputBatch(
+      const std::vector<int32_t>& p0Values,
+      std::mt19937_64& rng) {
+    const int rowCount = p0Values.size();
+    auto columnNames = pkColNames();
+    std::vector<VectorPtr> columns;
+
+    // Pk columns: deterministic offsets of p0Values; they never touch rng, so
+    // only the value columns below consume random draws.
+    for (int offset = 0; offset < param().numPartitionCols; ++offset) {
+      columns.push_back(vectorMaker_.flatVector<int32_t>(
+          rowCount, [&, offset](auto row) { return p0Values[row] + offset; }));
+    }
+
+    // Value columns: generated one after another so rng draws stay in order.
+    for (const auto& valueName : valueColNames()) {
+      columnNames.push_back(valueName);
+      columns.push_back(makeRandomValueVector(rowCount, rng));
+    }
+
+    return makeRowVector(columnNames, columns);
+  }
+
+  /// Verifies that the deserialized pages for each partition exactly match the
+  /// rows from inputBatches that were routed to that partition. Both expected
+  /// and actual rows are sorted (canonicalized) before comparison to allow
+  /// order-independent matching.
+  void verifyDataIntegrity(
+      const std::vector<RowVectorPtr>& inputBatches,
+      const std::vector<std::vector<std::unique_ptr<folly::IOBuf>>>& allPages,
+      int numPartitions) {
+    // Compute the expected per-partition row list with HashPartitionFunction,
+    // which is assumed to route rows exactly like the operator under test.
+    auto partitionFn = std::make_unique<HashPartitionFunction>(
+        false, numPartitions, inputType(), pkChannels());
+
+    std::vector<std::vector<std::pair<int, int>>> expectedRows(numPartitions);
+    for (int batchIdx = 0; batchIdx < static_cast<int>(inputBatches.size());
+         ++batchIdx) {
+      std::vector<uint32_t> assignments(inputBatches[batchIdx]->size());
+      partitionFn->partition(*inputBatches[batchIdx], assignments);
+      for (int rowIdx = 0; rowIdx < static_cast<int>(assignments.size());
+           ++rowIdx) {
+        expectedRows[assignments[rowIdx]].emplace_back(batchIdx, rowIdx);
+      }
+    }
+
+    const auto rowType = inputType();
+    int64_t totalRows = 0;
+
+    for (int p = 0; p < numPartitions; ++p) {
+      auto expected = gatherRows(inputBatches, expectedRows[p], rowType);
+      auto actual = concatPages(allPages[p], rowType);
+
+      totalRows += expected->size();
+      ASSERT_EQ(expected->size(), actual->size()) // Fatal: leaves this helper.
+          << "partition " << p << " row count mismatch";
+
+      // Sort both vectors before comparing to allow order-independent matching.
+      auto expectedSorted = canonicalize(expected);
+      auto actualSorted = canonicalize(actual);
+      velox::test::assertEqualVectors(expectedSorted, actualSorted);
+    }
+
+    int64_t sentRows = 0;
+    for (const auto& batch : inputBatches) {
+      sentRows += batch->size();
+    }
+    EXPECT_EQ(totalRows, sentRows); // Every input row was assigned somewhere.
+  }
+};
+
+// ─── singleFlush ─────────────────────────────────────────────────────────────
+
+// Sends one batch with the default buffer limits. All data is buffered without
+// triggering an intermediate flush; the final noMoreInput flush serializes
+// everything once. Verifies numAppends==1, numFlushes==1, numBlockedTimes==0,
+// and that every deserialized row matches its source.
+TEST_P(OptimizedPartitionedOutputParamTest, singleFlush) {
+  constexpr int kNumPartitions = 4;
+  // One row per distinct key 0..kNumPartitions-1; partition choice is by hash.
+  std::vector<int32_t> p0Values;
+  for (int i = 0; i < kNumPartitions; ++i) {
+    p0Values.push_back(i);
+  }
+
+  std::mt19937_64 rng(42);
+  const std::vector<RowVectorPtr> inputBatches = {
+      makeInputBatch(p0Values, rng)};
+
+  auto result = runPartitionedOutput(
+      "local://test-single-flush-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+  EXPECT_EQ(result.numAppends, 1);
+  EXPECT_EQ(result.numFlushes, 1);
+  EXPECT_EQ(result.numBlockedTimes, 0);
+}
+
+// ─── multipleFlushes ─────────────────────────────────────────────────────────
+
+// Sends multiple batches through a 1-byte serializer ceiling so each addInput
+// triggers its own flush. A 10-byte OutputBuffer ceiling forces blocking.
+// Concurrent consumers drain each partition so the driver can unblock.
+// Verifies integrity and numAppends==numFlushes==numBlockedTimes==kBatches.
+TEST_P(OptimizedPartitionedOutputParamTest, multipleFlushes) {
+  constexpr int kNumPartitions = 4;
+  constexpr int kBatches = 10;
+
+  // For wide schemas, reduce rows per batch so each batch stays small.
+  const int kRowsPerBatch = param().numValueCols >= 64 ? 2 : kNumPartitions;
+
+  std::vector<int32_t> p0Values(kRowsPerBatch);
+  for (int i = 0; i < kRowsPerBatch; ++i) {
+    p0Values[i] = i % kNumPartitions;
+  }
+  std::mt19937_64 rng(42);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-multiple-flushes-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions,
+      // 1-byte serializer ceiling flushes before every addInput.
+      // 10-byte OutputBuffer ceiling forces blocking on every enqueue.
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize, "1"},
+       {core::QueryConfig::kMaxOutputBufferSize, "10"}},
+      std::chrono::seconds{30}); // Same as the default timeout.
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+  EXPECT_EQ(result.numAppends, kBatches);
+  EXPECT_EQ(result.numFlushes, kBatches);
+  EXPECT_EQ(result.numBlockedTimes, kBatches);
+}
+
+// ─── uniformDistribution ─────────────────────────────────────────────────────
+
+// Sends many batches whose p1 values are drawn uniformly at random from
+// [0, 999], so rows spread over all partitions. Uses the default buffer size
+// (no intermediate flush). Verifies that all partitions are non-empty and that
+// data integrity holds across all rows.
+TEST_P(OptimizedPartitionedOutputParamTest, uniformDistribution) {
+  constexpr int kNumPartitions = 4;
+  constexpr int kBatches = 10;
+
+  std::mt19937_64 rng(123);
+  // Draw keys from a wide range so all partitions receive rows regardless of
+  // how the hash distributes them: 10 batches x 50 rows give 500 random p1
+  // values (repeats possible) over 4 partitions. The seed is fixed, so the
+  // generated input is the same on every run.
+  constexpr int kRowsPerBatch = 50;
+  std::uniform_int_distribution<int32_t> dist(0, 999);
+
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    std::vector<int32_t> p0Values(kRowsPerBatch);
+    for (auto& v : p0Values) {
+      v = dist(rng);
+    }
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-uniform-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // With 500 random p1 values over 4 partitions, every partition is expected
+  // to receive rows; the fixed seed keeps the input identical across runs.
+  EXPECT_EQ(result.numNonEmptyPartitions, kNumPartitions);
+}
+
+// ─── skewed distributions
+// ──────────────────────────────────────────────────────
+
+// Sends batches with 6 distinct key values whose frequencies halve at each
+// step, so non-empty partitions end up with very different row counts.
+// Because 6 < 8 some partitions stay empty; how many of the others fill
+// depends on hash collisions. This sits between uniformDistribution (all
+// full) and twoDestinations (at most 2 of 64 filled).
+TEST_P(OptimizedPartitionedOutputParamTest, moderateSkew) {
+  constexpr int kNumPartitions = 8;
+  constexpr int kBatches = 5;
+
+  // Key i appears 2^(5-i) times per batch: key 0 → 32 rows, key 1 → 16,
+  // key 2 → 8, key 3 → 4, key 4 → 2, key 5 → 1. Total: 63 rows per batch.
+  std::vector<int32_t> keyPattern;
+  for (int key = 0; key < 6; ++key) {
+    const int count = 1 << (5 - key); // 32, 16, 8, 4, 2, 1
+    for (int j = 0; j < count; ++j) {
+      keyPattern.push_back(key);
+    }
+  }
+
+  std::mt19937_64 rng(55);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    auto p0Values = keyPattern;
+    std::shuffle(p0Values.begin(), p0Values.end(), rng);
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-moderate-skew-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // 6 distinct keys → at most 6 non-empty partitions; 6 < 8 → at least one
+  // empty partition.
+  EXPECT_LE(result.numNonEmptyPartitions, 6);
+
+  // Verify a wide spread in per-partition row counts: the heaviest non-empty
+  // partition must have at least 2x the average non-empty partition size.
+  // NOTE(review): key 0 alone is 32/63 of the rows, so this holds whenever at
+  // least 4 partitions are non-empty; heavier collisions may break it.
+  int64_t maxRows = 0;
+  int64_t totalNonZeroRows = 0;
+  int64_t numNonZeroPartitions = 0;
+  for (int p = 0; p < kNumPartitions; ++p) {
+    if (result.rowCounts[p] > 0) {
+      maxRows = std::max(maxRows, result.rowCounts[p]);
+      totalNonZeroRows += result.rowCounts[p];
+      ++numNonZeroPartitions;
+    }
+  }
+  ASSERT_GT(numNonZeroPartitions, 0);
+  EXPECT_GE(maxRows * numNonZeroPartitions, totalNonZeroRows * 2);
+}
+
+// Sends 10 batches of 4 rows with p1 restricted to {0, 1} into a 64-partition
+// operator. At most 2 of the 64 partitions will receive any rows; the rest
+// must be empty. Verifies data integrity and the empty-partition invariant.
+TEST_P(OptimizedPartitionedOutputParamTest, twoDestinations) {
+  constexpr int kNumPartitions = 64;
+  constexpr int kBatches = 10;
+  constexpr int kRowsPerBatch = 4;
+
+  std::mt19937_64 rng(7);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    // p1 alternates between 0 and 1, giving exactly two distinct key tuples.
+    std::vector<int32_t> p0Values(kRowsPerBatch);
+    for (int i = 0; i < kRowsPerBatch; ++i) {
+      p0Values[i] = i % 2;
+    }
+    inputBatches.push_back(makeInputBatch(p0Values, rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-skewed-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // Two distinct key tuples: 1 or 2 hash buckets receive rows, never more.
+  EXPECT_LE(result.numNonEmptyPartitions, 2);
+  EXPECT_GE(result.numNonEmptyPartitions, 1);
+}
+
+// Sends 5 batches of 10 rows where every row carries the same partition key
+// value so all rows hash to a single destination. Verifies that exactly one
+// partition receives all rows and the remaining partitions stay empty.
+TEST_P(OptimizedPartitionedOutputParamTest, singleDestination) {
+  constexpr int kNumPartitions = 8;
+  constexpr int kBatches = 5;
+  constexpr int kRowsPerBatch = 10;
+
+  // Every row has p1=0 (p2=1, p3=2, p4=3 for multi-pk params), so the hash is
+  // identical for every row and all rows land in one partition.
+  std::mt19937_64 rng(99);
+  std::vector<RowVectorPtr> inputBatches;
+  inputBatches.reserve(kBatches);
+  for (int b = 0; b < kBatches; ++b) {
+    inputBatches.push_back(
+        makeInputBatch(std::vector<int32_t>(kRowsPerBatch, 0), rng));
+  }
+
+  auto result = runPartitionedOutput(
+      "local://test-single-dest-" + param().name,
+      inputBatches,
+      pkColNames(),
+      kNumPartitions);
+
+  verifyDataIntegrity(inputBatches, result.pages, kNumPartitions);
+
+  // All rows must land in exactly one partition.
+  EXPECT_EQ(result.numNonEmptyPartitions, 1);
+
+  // Any partition that received rows must hold every row from every batch.
+  const int64_t totalInputRows = static_cast<int64_t>(kBatches) * kRowsPerBatch;
+  for (int p = 0; p < kNumPartitions; ++p) {
+    if (result.rowCounts[p] > 0) {
+      EXPECT_EQ(result.rowCounts[p], totalInputRows) << "partition " << p;
+    }
+  }
+}
+
+// ─── instantiation ───────────────────────────────────────────────────────────
+
+INSTANTIATE_TEST_SUITE_P(
+    Params,
+    OptimizedPartitionedOutputParamTest,
+    ::testing::ValuesIn(testParams()),
+    [](const ::testing::TestParamInfo<TestParam>& info) {
+      return info.param.name; // Used by gtest as the per-parameter test name.
+    });
+
+// ─── non-parameterized tests ─────────────────────────────────────────────────
+
+// In single-partition case, if the second addInput() is estimated to stay
+// below the partitioned-output limit, it doesn't flush before appending.
+TEST_F(OptimizedPartitionedOutputTest, noPreFlushWhenEstimateBelowLimit) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  std::vector<RowVectorPtr> inputBatches = {
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}),
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({20})})};
+
+  const auto twoRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8);
+  auto result = runPartitionedOutput(
+      "local://test-buffer-below-limit",
+      inputBatches,
+      {}, // No partition keys.
+      1, // Single partition.
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(twoRowPageBytes + 1)}}); // One byte of headroom.
+
+  EXPECT_EQ(result.numAppends, 2);
+  EXPECT_EQ(result.numFlushes, 1);
+
+  auto expected = makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 20})});
+  auto actual = concatPages(result.pages[0], rowType);
+  velox::test::assertEqualVectors(expected, actual);
+}
+
+// In the single-partition case, a second addInput() estimated to land exactly
+// on the partitioned-output limit is appended without a pre-flush.
+TEST_F(OptimizedPartitionedOutputTest, noPreFlushWhenEstimateAtLimit) {
+  const auto pageType = ROW({"v"}, {BIGINT()});
+  std::vector<RowVectorPtr> batches = {
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}),
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({20})})};
+
+  const auto limitBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8);
+  auto output = runPartitionedOutput(
+      "local://test-buffer-equals-limit",
+      batches,
+      {},
+      1,
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(limitBytes)}});
+
+  EXPECT_EQ(output.numAppends, 2);
+  EXPECT_EQ(output.numFlushes, 1);
+
+  auto merged = concatPages(output.pages[0], pageType);
+  auto wanted = makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 20})});
+  velox::test::assertEqualVectors(wanted, merged);
+}
+
+// In the single-partition case, a second addInput() whose estimated size
+// exceeds the partitioned-output limit forces a flush before the append.
+TEST_F(OptimizedPartitionedOutputTest, preFlushWhenEstimateExceedsLimit) {
+  const auto pageType = ROW({"v"}, {BIGINT()});
+  std::vector<RowVectorPtr> batches = {
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}),
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({20})})};
+
+  const auto limitBytes = simpleColumnPageBytes("LONG_ARRAY", 2, 0, 8) - 1;
+  auto output = runPartitionedOutput(
+      "local://test-buffer-exceeds-limit",
+      batches,
+      {},
+      1,
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(limitBytes)}});
+
+  EXPECT_EQ(output.numAppends, 2);
+  EXPECT_EQ(output.numFlushes, 2);
+
+  auto merged = concatPages(output.pages[0], pageType);
+  auto wanted = makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 20})});
+  velox::test::assertEqualVectors(wanted, merged);
+}
+
+// With multiple partitions, estimateBytesAfterAppend() may conservatively
+// assume the input could open the last empty partition even though every row
+// actually lands in an existing one, which triggers a pre-flush.
+TEST_F(
+    OptimizedPartitionedOutputTest,
+    preFlushWhenConservativeEstimateExceedsLimit) {
+  const auto pageType = ROW({"p1"}, {INTEGER()});
+  std::vector<RowVectorPtr> batches = {
+      makeRowVector({"p1"}, {makeFlatVector<int32_t>({5})}),
+      makeRowVector({"p1"}, {makeFlatVector<int32_t>({5})})};
+
+  const auto limitBytes = simpleColumnPageBytes("INT_ARRAY", 2, 0, 4);
+  // Exact append fits under this limit; the conservative estimate does not.
+  auto output = runPartitionedOutput(
+      "local://test-buffer-conservative-exceeds-limit",
+      batches,
+      {"p1"},
+      2,
+      {{core::QueryConfig::kMaxPartitionedOutputBufferSize,
+        std::to_string(limitBytes)}});
+
+  EXPECT_EQ(output.numAppends, 2);
+  EXPECT_EQ(output.numFlushes, 2);
+  EXPECT_EQ(output.numNonEmptyPartitions, 1);
+
+  EXPECT_THAT(output.pageCounts, testing::UnorderedElementsAre(2, 0));
+  EXPECT_THAT(output.rowCounts, testing::UnorderedElementsAre(2, 0));
+
+  const auto targetPartition = output.rowCounts[0] > 0 ? 0 : 1;
+
+  auto merged = concatPages(output.pages[targetPartition], pageType);
+  auto wanted = makeRowVector({"p1"}, {makeFlatVector<int32_t>({5, 5})});
+  velox::test::assertEqualVectors(wanted, merged);
+}
+
+// replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput, so
+// a plan that requests it must fail the task with the corresponding error.
+TEST_F(OptimizedPartitionedOutputTest, replicateNullsAndAnyUnsupported) {
+  auto data = makeRowVector(
+      {"p1", "v1"},
+      {makeNullableFlatVector<int32_t>({0, std::nullopt, 1}),
+       makeFlatVector<std::string>({"a", "b", "c"})});
+
+  auto planNode =
+      PlanBuilder()
+          .values({data})
+          .partitionedOutput({"p1"}, 2, /*replicateNullsAndAny=*/true, {"v1"})
+          .planNode();
+
+  const auto* taskId = "local://test-replicate-nulls-unsupported-0";
+  auto failingTask = Task::create(
+      taskId,
+      core::PlanFragment{planNode},
+      0,
+      createQueryContext({}),
+      Task::ExecutionMode::kParallel);
+  failingTask->start(1);
+
+  // Ten seconds, expressed in microseconds, handed to waitForTaskFailure().
+  const auto timeoutUs =
+      std::chrono::microseconds{std::chrono::seconds{10}}.count();
+  ASSERT_TRUE(waitForTaskFailure(failingTask.get(), timeoutUs));
+  ASSERT_THAT(
+      failingTask->errorMessage(),
+      testing::HasSubstr(
+          "replicateNullsAndAny is not yet supported by OptimizedPartitionedOutput"));
+}
+
+TEST_F(OptimizedPartitionedOutputTest, outputLayout) {
+  auto data = makeRowVector(
+      {"p1", "v1", "v2", "unused"},
+      {makeFlatVector<int32_t>({0, 1, 2, 3, 4, 5, 6, 7}),
+       makeFlatVector<int64_t>({10, 11, 12, 13, 14, 15, 16, 17}),
+       makeFlatVector<int8_t>({20, 21, 22, 23, 24, 25, 26, 27}),
+       makeFlatVector<int64_t>({30, 31, 32, 33, 34, 35, 36, 37})});
+  auto dataCopy =
+      std::static_pointer_cast<RowVector>(BaseVector::copy(*data, pool()));
+
+  const std::vector<std::string> layout = {"v2", "v1"};
+  const auto layoutType = outputTypeForLayout(asRowType(data->type()), layout);
+  // Expected output is derived from the copy, not the vector fed to the task.
+  auto wanted = buildOutput(dataCopy, layout);
+
+  auto output = runPartitionedOutputWithLayout(
+      "local://test-optimized-output-layout", {data}, {}, 1, layout);
+
+  auto merged = concatPages(output.pages[0], layoutType);
+  velox::test::assertEqualVectors(wanted, merged);
+}
+
+TEST_F(OptimizedPartitionedOutputTest, duplicateOutputColumns) {
+  constexpr int kNumPartitions = 4;
+  auto data = makeRowVector(
+      {"p1", "v1"},
+      {makeFlatVector<int32_t>({0, 1, 2, 3, 0, 1, 2, 3}),
+       makeFlatVector<int64_t>({10, 11, 12, 13, 14, 15, 16, 17})});
+  auto dataCopy =
+      std::static_pointer_cast<RowVector>(BaseVector::copy(*data, pool()));
+  const std::vector<std::string> layout = {"v1", "v1"};
+  const auto dataType = asRowType(data->type());
+  const auto layoutType = outputTypeForLayout(dataType, layout);
+  auto projected = buildOutput(dataCopy, layout);
+
+  auto output = runPartitionedOutputWithLayout(
+      "local://test-optimized-output-layout-duplicated-columns",
+      {data},
+      {"p1"},
+      kNumPartitions,
+      layout);
+
+  std::vector<uint32_t> partitionOfRow(dataCopy->size());
+  auto hashFn = std::make_unique<HashPartitionFunction>(
+      false, kNumPartitions, dataType, std::vector<column_index_t>{0});
+  hashFn->partition(*dataCopy, partitionOfRow);
+
+  std::vector<std::vector<std::pair<int, int>>> perPartition(kNumPartitions);
+  for (vector_size_t row = 0; row < partitionOfRow.size(); ++row) {
+    perPartition[partitionOfRow[row]].emplace_back(0, row);
+  }
+
+  for (int p = 0; p < kNumPartitions; ++p) {
+    auto expected = gatherRows({projected}, perPartition[p], layoutType);
+    auto actual = concatPages(output.pages[p], layoutType);
+    ASSERT_EQ(expected->size(), actual->size()) << "partition " << p;
+    velox::test::assertEqualVectors(
+        canonicalize(expected), canonicalize(actual));
+  }
+}
+
+} // namespace facebook::velox::exec::test
diff --git a/velox/exec/tests/OptimizedVectorHasherTest.cpp b/velox/exec/tests/OptimizedVectorHasherTest.cpp
new file mode 100644
index 00000000000..e0a107b6fd4
--- /dev/null
+++ b/velox/exec/tests/OptimizedVectorHasherTest.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <gtest/gtest.h>
+
+#include "velox/common/base/tests/GTestUtils.h"
+#include "velox/exec/OptimizedVectorHasher.h"
+#include "velox/exec/VectorHasher.h"
+#include "velox/type/tests/utils/CustomTypesForTesting.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook;
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+using namespace facebook::velox::test;
+
+namespace {
+
+class OptimizedVectorHasherTest : public testing::Test, public VectorTestBase {
+ protected:
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{});
+  }
+
+  BufferPtr makeIndices(
+      vector_size_t size,
+      std::function<vector_size_t(vector_size_t)> indexAt) {
+    auto buffer = AlignedBuffer::allocate<vector_size_t>(size, pool());
+    auto* raw = buffer->asMutable<vector_size_t>();
+    for (vector_size_t row = 0; row < size; ++row) {
+      raw[row] = indexAt(row);
+    }
+    return buffer;
+  }
+
+  static SelectivityVector makeOddRows(vector_size_t size) {
+    SelectivityVector selected(size);
+    for (vector_size_t even = 0; even < size; even += 2) {
+      selected.setValid(even, false); // deselect rows 0, 2, 4, ...
+    }
+    selected.updateBounds();
+    return selected;
+  }
+
+  void compareHashes(
+      const TypePtr& type,
+      const VectorPtr& vector,
+      const SelectivityVector& rows,
+      bool mix,
+      uint64_t seed = 0) {
+    auto baseline = VectorHasher::create(type, 0);
+    auto optimized = OptimizedVectorHasher::create(type, 0);
+
+    raw_vector<uint64_t> want(vector->size(), pool());
+    raw_vector<uint64_t> got(vector->size(), pool());
+    if (!mix) {
+      std::fill(want.begin(), want.end(), 0);
+      std::fill(got.begin(), got.end(), 0);
+    } else {
+      std::iota(want.begin(), want.end(), seed);
+      std::iota(got.begin(), got.end(), seed);
+    }
+
+    baseline->decode(*vector, rows);
+    optimized->decode(*vector, rows);
+
+    baseline->hash(rows, mix, want);
+    optimized->hash(rows, mix, got);
+
+    for (vector_size_t row = 0; row < vector->size(); ++row) {
+      EXPECT_EQ(want[row], got[row]) << "at " << row;
+    }
+  }
+
+  void comparePrecomputed(
+      const TypePtr& type,
+      const VectorPtr& value,
+      vector_size_t size,
+      bool mix,
+      uint64_t seed = 0) {
+    auto baseline = VectorHasher::create(type, 0);
+    auto optimized = OptimizedVectorHasher::create(type, 0);
+
+    raw_vector<uint64_t> want(size, pool());
+    raw_vector<uint64_t> got(size, pool());
+    if (!mix) {
+      std::fill(want.begin(), want.end(), 0);
+      std::fill(got.begin(), got.end(), 0);
+    } else {
+      std::iota(want.begin(), want.end(), seed);
+      std::iota(got.begin(), got.end(), seed);
+    }
+
+    const SelectivityVector everyRow(size);
+    baseline->precompute(*value);
+    optimized->precompute(*value);
+
+    baseline->hashPrecomputed(everyRow, mix, want);
+    optimized->hashPrecomputed(mix, got);
+
+    for (vector_size_t row = 0; row < size; ++row) {
+      EXPECT_EQ(want[row], got[row]) << "at " << row;
+    }
+  }
+};
+
+TEST_F(OptimizedVectorHasherTest, flat) {
+  auto input = BaseVector::create(BIGINT(), 100, pool());
+  auto* values = input->asFlatVector<int64_t>();
+  for (vector_size_t row = 0; row < 100; ++row) {
+    if (row % 5 != 0) {
+      values->set(row, row);
+    } else {
+      values->setNull(row, true);
+    }
+  }
+
+  const SelectivityVector everyRow(100);
+  const auto oddOnly = makeOddRows(100);
+
+  compareHashes(BIGINT(), input, oddOnly, false);
+  compareHashes(BIGINT(), input, everyRow, false);
+  compareHashes(BIGINT(), input, everyRow, true, 10);
+
+  values->setNull(0, true);
+  comparePrecomputed(BIGINT(), input, 100, false);
+
+  values->setNull(0, false);
+  values->set(0, 7);
+  comparePrecomputed(BIGINT(), input, 100, false);
+
+  values->set(0, 55);
+  comparePrecomputed(BIGINT(), input, 100, true, 20);
+}
+
+TEST_F(OptimizedVectorHasherTest, boolFlat) {
+  constexpr vector_size_t kSize = 137;
+  auto flags = makeFlatVector<bool>(
+      kSize,
+      [](vector_size_t row) { return row % 7 == 0 || row % 11 == 3; },
+      [](vector_size_t row) { return row % 13 == 5; });
+  const SelectivityVector everyRow(flags->size());
+  const auto oddOnly = makeOddRows(flags->size());
+
+  compareHashes(BOOLEAN(), flags, oddOnly, false);
+  compareHashes(BOOLEAN(), flags, everyRow, false);
+  compareHashes(BOOLEAN(), flags, everyRow, true, 17);
+
+  flags = makeFlatVector<bool>(
+      kSize, [](vector_size_t row) { return row % 5 < 2; });
+  compareHashes(BOOLEAN(), flags, everyRow, false);
+  compareHashes(BOOLEAN(), flags, everyRow, true, 23);
+}
+
+TEST_F(OptimizedVectorHasherTest, nans) {
+  static const auto kNaN = std::numeric_limits<double>::quiet_NaN();
+  static const auto kSNaN = std::numeric_limits<double>::signaling_NaN();
+  auto doubles = makeFlatVector<double>({1.0, -1.0, kNaN, kSNaN, 0.0, -0.0});
+  const SelectivityVector everyRow(doubles->size());
+
+  compareHashes(DOUBLE(), doubles, everyRow, false);
+  compareHashes(DOUBLE(), doubles, everyRow, true, 15);
+}
+
+TEST_F(OptimizedVectorHasherTest, nonNullConstant) {
+  auto constant = BaseVector::createConstant(INTEGER(), 123, 6, pool());
+  const SelectivityVector everyRow(constant->size());
+  const auto oddOnly = makeOddRows(constant->size());
+
+  compareHashes(INTEGER(), constant, oddOnly, false);
+  compareHashes(INTEGER(), constant, everyRow, false);
+  compareHashes(INTEGER(), constant, everyRow, true, 7);
+}
+
+TEST_F(OptimizedVectorHasherTest, nullConstant) {
+  auto constant = BaseVector::createNullConstant(INTEGER(), 6, pool());
+  const SelectivityVector everyRow(constant->size());
+  const auto oddOnly = makeOddRows(constant->size());
+
+  compareHashes(INTEGER(), constant, oddOnly, false);
+  compareHashes(INTEGER(), constant, everyRow, false);
+  compareHashes(INTEGER(), constant, everyRow, true, 11);
+}
+
+TEST_F(OptimizedVectorHasherTest, unknown) {
+  auto allNulls = makeAllNullFlatVector<UnknownValue>(100);
+  const SelectivityVector everyRow(allNulls->size());
+  const auto oddOnly = makeOddRows(allNulls->size());
+
+  compareHashes(UNKNOWN(), allNulls, oddOnly, false);
+  compareHashes(UNKNOWN(), allNulls, everyRow, false);
+  compareHashes(UNKNOWN(), allNulls, everyRow, true, 0);
+}
+
+TEST_F(OptimizedVectorHasherTest, dictionary) {
+  auto base = makeNullableFlatVector<int64_t>({10, 20, std::nullopt, 40, 50});
+  constexpr vector_size_t kSize = 100;
+  auto wrapped = BaseVector::wrapInDictionary(
+      makeNulls(kSize, [&](vector_size_t row) { return row == 1 || row == 7; }),
+      makeIndices(kSize, [&](vector_size_t row) { return row % base->size(); }),
+      kSize,
+      base);
+  const SelectivityVector everyRow(wrapped->size());
+  const auto oddOnly = makeOddRows(wrapped->size());
+
+  compareHashes(BIGINT(), wrapped, oddOnly, false);
+  compareHashes(BIGINT(), wrapped, everyRow, false);
+  compareHashes(BIGINT(), wrapped, everyRow, true, 10);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparison) {
+  auto input = makeNullableFlatVector<int64_t>(
+      {0, 1, 256, 257, std::nullopt, 512, 513},
+      BIGINT_TYPE_WITH_CUSTOM_COMPARISON());
+  const SelectivityVector everyRow(input->size());
+
+  compareHashes(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), input, everyRow, false);
+  compareHashes(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(), input, everyRow, true, 9);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparisonArray) {
+  auto arrays = makeNullableArrayVector<int64_t>(
+      {{0, 1, 2},
+       {256, 257, 258},
+       {512, 513, 514},
+       {3, 4, 5},
+       {259, 260, 261},
+       {515, 516, 517},
+       {std::nullopt}},
+      ARRAY(BIGINT_TYPE_WITH_CUSTOM_COMPARISON()));
+  const SelectivityVector everyRow(arrays->size());
+
+  compareHashes(
+      ARRAY(BIGINT_TYPE_WITH_CUSTOM_COMPARISON()), arrays, everyRow, false);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparisonMap) {
+  auto maps = makeNullableMapVector<int64_t, int64_t>(
+      {std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {0, 10}, {1, 11}, {2, 12}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {256, 266}, {257, 267}, {258, 268}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {512, 522}, {513, 523}, {514, 524}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {3, 103}, {4, 104}, {5, 105}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {259, 359}, {260, 360}, {261, 361}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {515, 615}, {516, 616}, {517, 617}},
+       std::vector<std::pair<int64_t, std::optional<int64_t>>>{
+           {0, std::nullopt}}},
+      MAP(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(),
+          BIGINT_TYPE_WITH_CUSTOM_COMPARISON()));
+  const SelectivityVector everyRow(maps->size());
+
+  compareHashes(
+      MAP(BIGINT_TYPE_WITH_CUSTOM_COMPARISON(),
+          BIGINT_TYPE_WITH_CUSTOM_COMPARISON()),
+      maps,
+      everyRow,
+      false);
+}
+
+TEST_F(OptimizedVectorHasherTest, customComparisonRow) {
+  auto rowInput = makeRowVector(
+      {"a"},
+      {makeNullableFlatVector<int64_t>(
+          {std::nullopt, 0, 1, 256, 257, 512, 513},
+          BIGINT_TYPE_WITH_CUSTOM_COMPARISON())});
+  const SelectivityVector everyRow(rowInput->size());
+
+  compareHashes(rowInput->type(), rowInput, everyRow, false);
+}
+
+TEST_F(OptimizedVectorHasherTest, precompute) {
+  auto single = makeNullableFlatVector<int64_t>({std::nullopt});
+  comparePrecomputed(BIGINT(), single, 100, false);
+
+  single = makeNullableFlatVector<int64_t>({7});
+  comparePrecomputed(BIGINT(), single, 100, false);
+
+  single = makeNullableFlatVector<int64_t>({55});
+  comparePrecomputed(BIGINT(), single, 100, true, 100);
+}
+
+TEST_F(OptimizedVectorHasherTest, typeMismatch) {
+  auto bigintHasher = OptimizedVectorHasher::create(BIGINT(), 0);
+  auto strings = makeFlatVector<StringView>({"a", "b", "c"});
+  SelectivityVector everyRow(strings->size());
+
+  VELOX_ASSERT_THROW(
+      bigintHasher->decode(*strings, everyRow), "Type mismatch: BIGINT vs. VARCHAR");
+}
+
+} // namespace
diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp
index 9117cba55ee..468ae20bf79 100644
--- a/velox/exec/tests/utils/PlanBuilder.cpp
+++ b/velox/exec/tests/utils/PlanBuilder.cpp
@@ -1689,7 +1689,8 @@ class RoundRobinRowPartitionFunctionSpec : public core::PartitionFunctionSpec {
  public:
   std::unique_ptr<core::PartitionFunction> create(
       int numPartitions,
-      bool /*localExchange*/) const override {
+      bool /*localExchange*/,
+      bool /*useOptimizedPartitionFunction*/ = false) const override {
     return std::make_unique<RoundRobinRowPartitionFunction>(numPartitions);
   }
 
diff --git a/velox/flag_definitions/flags.cpp b/velox/flag_definitions/flags.cpp
index 8648e80a68e..4adc6a5a22c 100644
--- a/velox/flag_definitions/flags.cpp
+++ b/velox/flag_definitions/flags.cpp
@@ -55,6 +55,8 @@ DEFINE_int32(
 
 DEFINE_bool(avx2, true, "Enables use of AVX2 when available");
 
+DEFINE_bool(avx512f, true, "Enables use of AVX512F when available");
+
 DEFINE_bool(bmi2, true, "Enables use of BMI2 when available");
 
 // Used in exec/Expr.cpp
diff --git a/velox/serializers/CMakeLists.txt b/velox/serializers/CMakeLists.txt
index c5227f763ff..366b043aeb1 100644
--- a/velox/serializers/CMakeLists.txt
+++ b/velox/serializers/CMakeLists.txt
@@ -29,6 +29,7 @@ velox_add_library(
   UnsafeRowSerializer.cpp
   PrestoBatchVectorSerializer.cpp
   PrestoHeader.cpp
+  PrestoIterativePartitioningSerializer.cpp
   PrestoIterativeVectorSerializer.cpp
   PrestoSerializerDeserializationUtils.cpp
   PrestoSerializerEstimationUtils.cpp
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.cpp b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
new file mode 100644
index 00000000000..533b8d6bb75
--- /dev/null
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+
+#include <algorithm>
+#include <optional>
+
+#include "velox/common/base/BitUtil.h"
+#include "velox/type/Type.h"
+#include "velox/vector/ComplexVector.h"
+#include "velox/vector/ConstantVector.h"
+#include "velox/vector/FlatVector.h"
+
+namespace facebook::velox::serializer::presto {
+
+namespace {
+
+constexpr int8_t kCheckSumBitMask = 4;
+constexpr int64_t kVectorSizeTypeSize{sizeof(vector_size_t)};
+// Offset just past [numRows:4][codec:1].
+constexpr int64_t kUncompressedSizeOffset{kVectorSizeTypeSize + 1};
+// [numRows:4][codec:1][uncompressedSize:4][compressedSize:4][checksum:8]
+constexpr int64_t kHeaderSize{kUncompressedSizeOffset + 4 + 4 + 8};
+
+// Chunk size, in bytes, for flushing constant values.
+constexpr int32_t kChunkBytes = 4096;
+
+constexpr std::string_view kByteArray{"BYTE_ARRAY"};
+constexpr std::string_view kShortArray{"SHORT_ARRAY"};
+constexpr std::string_view kIntArray{"INT_ARRAY"};
+constexpr std::string_view kLongArray{"LONG_ARRAY"};
+constexpr std::string_view kInt128Array{"INT128_ARRAY"};
+constexpr std::string_view kVariableWidth{"VARIABLE_WIDTH"};
+constexpr std::string_view kRow{"ROW"};
+
+inline void writeInt32(OutputStream* out, int32_t value) {
+  out->write(reinterpret_cast<const char*>(&value), sizeof(int32_t));
+}
+
+inline void writeInt64(OutputStream* out, int64_t value) {
+  out->write(reinterpret_cast<const char*>(&value), sizeof(int64_t));
+}
+
+char getCodecMarker(bool checksumEnabled) {
+  // The checksum bit is the only flag this function ever sets.
+  if (!checksumEnabled) {
+    return 0;
+  }
+  return static_cast<char>(kCheckSumBitMask);
+}
+
+std::string_view typeToEncodingName(const TypePtr& type) {
+  switch (type->kind()) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+      return kByteArray;
+    case TypeKind::SMALLINT:
+      return kShortArray;
+    case TypeKind::INTEGER:
+    case TypeKind::REAL:
+      return kIntArray;
+    case TypeKind::BIGINT:
+    case TypeKind::DOUBLE:
+    case TypeKind::TIMESTAMP:
+      return kLongArray;
+    case TypeKind::HUGEINT:
+      return kInt128Array;
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+      return kVariableWidth;
+    case TypeKind::ROW:
+      return kRow;
+    default: // e.g. ARRAY and MAP are not handled here
+      VELOX_FAIL("Unsupported type kind: {}", static_cast<int>(type->kind()));
+  }
+}
+
+/// Completes the Presto page CRC: takes the data checksum accumulated by the
+/// listener and folds in the codec marker, row count, and uncompressed size.
+int64_t computeChecksum(
+    PrestoOutputStreamListener& listener,
+    int8_t codecMarker,
+    int32_t numRows,
+    int32_t uncompressedSize) {
+  auto pageCrc = listener.crc();
+  pageCrc.process_bytes(&codecMarker, sizeof(codecMarker));
+  pageCrc.process_bytes(&numRows, sizeof(numRows));
+  pageCrc.process_bytes(&uncompressedSize, sizeof(uncompressedSize));
+  return static_cast<int64_t>(pageCrc.checksum());
+}
+
+/// Returns the serialized byte width of a fixed-width type, matching the
+/// sizeof(T) used in flushFlatValues. Returns 0 for any other kind.
+int32_t fixedTypeWidth(TypeKind kind) {
+  switch (kind) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+      return 1;
+    case TypeKind::SMALLINT:
+      return 2;
+    case TypeKind::INTEGER:
+    case TypeKind::REAL:
+      return 4;
+    case TypeKind::BIGINT:
+    case TypeKind::DOUBLE:
+      return 8;
+    case TypeKind::TIMESTAMP:
+    case TypeKind::HUGEINT:
+      return 16;
+    default:
+      return 0;
+  }
+}
+
+/// Returns the exact bytes for one fixed-width column in one partition.
+int64_t
+simpleColumnBytes(const TypePtr& colType, int64_t numRows, int64_t numNulls) {
+  const int64_t nameBytes = typeToEncodingName(colType).size();
+  const int64_t bitmapBytes = numNulls > 0 ? bits::nbytes(numRows) : 0;
+  const int64_t valueBytes =
+      (numRows - numNulls) * fixedTypeWidth(colType->kind());
+  // length-prefixed encoding name, rowCount, nullFlag, null bitmap, values
+  return 4 + nameBytes + 4 + 1 + bitmapBytes + valueBytes;
+}
+
+/// Returns the number of nulls in 'vector', scanning dictionaries row by row.
+/// Returns std::nullopt only for encodings the switch below does not handle.
+std::optional<vector_size_t> countNulls(const BaseVector& vector) {
+  if (!vector.mayHaveNulls()) {
+    return 0;
+  }
+
+  if (const auto nullCount = vector.getNullCount()) {
+    return *nullCount;
+  }
+
+  switch (vector.encoding()) {
+    case VectorEncoding::Simple::FLAT:
+    case VectorEncoding::Simple::ROW:
+      return BaseVector::countNulls(vector.nulls(), vector.size());
+    case VectorEncoding::Simple::CONSTANT:
+      return vector.isNullAt(0) ? vector.size() : 0;
+    case VectorEncoding::Simple::DICTIONARY: {
+      vector_size_t nullCount = 0;
+      for (auto i = 0; i < vector.size(); ++i) {
+        nullCount += vector.isNullAt(i);
+      }
+      return nullCount;
+    }
+    default:
+      return std::nullopt;
+  }
+}
+
+/// Upper bound on null-bitmap bytes when totalRows are spread over
+/// numPartitionsWithNulls partitions: one byte for the first row of each
+/// partition, plus one more byte for every 8 of the remaining rows.
+int64_t maxBitmapBytes(int64_t totalRows, int64_t numPartitionsWithNulls) {
+  if (numPartitionsWithNulls != 0) {
+    VELOX_DCHECK_LE(numPartitionsWithNulls, totalRows);
+    return numPartitionsWithNulls + (totalRows - numPartitionsWithNulls) / 8;
+  }
+  return 0;
+}
+
+/// Base class for column nodes in the serializer's per-partition accounting.
+///
+/// Each node keeps per-partition row, null, and byte counters for one column;
+/// subclasses update them in append() and clear() resets them to zero.
+class ColumnBufferState {
+ public:
+  ColumnBufferState(TypePtr type, uint32_t numPartitions)
+      : type_(std::move(type)),
+        numPartitions_(numPartitions),
+        rowsPerPartition_(numPartitions, 0),
+        nullsPerPartition_(numPartitions, 0),
+        bytesPerPartition_(numPartitions, 0) {}
+
+  virtual ~ColumnBufferState() = default;
+
+  static std::unique_ptr<ColumnBufferState> create(
+      const TypePtr& type,
+      uint32_t numPartitions);
+
+  virtual void append(const PartitionedVectorPtr& partitionedVector) = 0;
+
+  virtual void clear() {
+    std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0);
+    std::fill(nullsPerPartition_.begin(), nullsPerPartition_.end(), 0);
+    std::fill(bytesPerPartition_.begin(), bytesPerPartition_.end(), 0);
+    numNonEmptyPartitions_ = 0;
+    numPartitionsWithNulls_ = 0;
+  }
+
+  const std::vector<vector_size_t>& rowsPerPartition() const {
+    return rowsPerPartition_;
+  }
+
+  const std::vector<int64_t>& bytesPerPartition() const {
+    return bytesPerPartition_;
+  }
+
+  uint32_t numNonEmptyPartitions() const {
+    return numNonEmptyPartitions_;
+  }
+
+  uint32_t numPartitionsWithNulls() const {
+    return numPartitionsWithNulls_;
+  }
+
+  int64_t nullBitmapBytesBuffered() const {
+    int64_t total = 0;
+    for (uint32_t p = 0; p < numPartitions_; ++p) {
+      if (nullsPerPartition_[p] > 0) {
+        total += bits::nbytes(rowsPerPartition_[p]);
+      }
+    }
+    return total;
+  }
+
+ protected:
+  const TypePtr type_;
+  const uint32_t numPartitions_;
+  std::vector<vector_size_t> rowsPerPartition_;
+  std::vector<vector_size_t> nullsPerPartition_;
+  std::vector<int64_t> bytesPerPartition_;
+
+  // Number of partitions with at least one buffered row.
+  uint32_t numNonEmptyPartitions_{0};
+
+  // Number of partitions that require a null bitmap.
+  uint32_t numPartitionsWithNulls_{0};
+};
+
+/// Buffer state for one fixed-width column.
+class FixedWidthBufferState : public ColumnBufferState {
+ public:
+  FixedWidthBufferState(TypePtr type, uint32_t numPartitions)
+      : ColumnBufferState(std::move(type), numPartitions) {}
+
+  void append(const PartitionedVectorPtr& partitionedVector) override {
+    for (uint32_t p = 0; p < numPartitions_; ++p) {
+      const auto numRows = partitionedVector->numRowsAt(p);
+      if (numRows == 0) {
+        continue;
+      }
+
+      const auto numNulls = partitionedVector->numNullsAt(p);
+      auto& rows = rowsPerPartition_[p];
+      auto& nulls = nullsPerPartition_[p];
+
+      if (rows == 0) {
+        ++numNonEmptyPartitions_;
+      }
+      if (nulls == 0 && numNulls > 0) {
+        ++numPartitionsWithNulls_;
+      }
+      rows += numRows;
+      nulls += numNulls;
+      bytesPerPartition_[p] = simpleColumnBytes(type_, rows, nulls);
+    }
+  }
+};
+
+/// VARCHAR/VARBINARY column state; append() is not implemented yet.
+class VariableWidthBufferState : public ColumnBufferState {
+ public:
+  VariableWidthBufferState(TypePtr type, uint32_t numPartitions)
+      : ColumnBufferState(std::move(type), numPartitions) {}
+
+  void append(const PartitionedVectorPtr& /*partitionedVector*/) override {
+    VELOX_NYI(
+        "Variable-width columns are not yet supported by "
+        "PrestoIterativePartitioningSerializer::append");
+  }
+};
+
+std::unique_ptr<ColumnBufferState> ColumnBufferState::create(
+    const TypePtr& type,
+    uint32_t numPartitions) {
+  const auto kind = type->kind();
+  switch (kind) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+    case TypeKind::SMALLINT:
+    case TypeKind::INTEGER:
+    case TypeKind::BIGINT:
+    case TypeKind::REAL:
+    case TypeKind::DOUBLE:
+    case TypeKind::HUGEINT:
+      return std::make_unique<FixedWidthBufferState>(type, numPartitions);
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+      return std::make_unique<VariableWidthBufferState>(type, numPartitions);
+    case TypeKind::TIMESTAMP:
+    case TypeKind::ROW:
+    case TypeKind::ARRAY:
+    case TypeKind::MAP:
+      // Recognized kinds that are reported as not yet implemented.
+      VELOX_NYI(
+          "Unsupported type kind for createColumnBufferState: {}", kind);
+    default:
+      VELOX_UNSUPPORTED(
+          "Unsupported type kind for createColumnBufferState: {}", kind);
+  }
+}
+
+} // namespace
+
+/// Top-level buffer state for one output page.
+///
+/// Per partition, tracks the row count and the total bytes: a fixed page-level
+/// overhead (kHeaderSize + 4) plus the byte counts of all child columns.
+class BufferState {
+ public:
+  BufferState(
+      uint32_t numPartitions,
+      std::vector<std::unique_ptr<ColumnBufferState>> children)
+      : numPartitions_(numPartitions),
+        rowsPerPartition_(numPartitions, 0),
+        bytesPerPartition_(numPartitions, 0),
+        children_(std::move(children)) {}
+
+  static std::unique_ptr<BufferState> create(
+      const RowTypePtr& type,
+      uint32_t numPartitions);
+
+  void append(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<column_index_t>& outputToInputChannels) {
+    auto rowVector =
+        std::dynamic_pointer_cast<PartitionedRowVector>(partitionedVector);
+    VELOX_CHECK_NOT_NULL(rowVector);
+
+    rowsBuffered_ += partitionedVector->baseVector()->size();
+
+    for (column_index_t column = 0; column < children_.size(); ++column) {
+      const auto inputColumn = outputToInputChannels.empty()
+          ? column
+          : outputToInputChannels[column];
+      children_[column]->append(rowVector->childAt(inputColumn));
+    }
+
+    for (uint32_t p = 0; p < numPartitions_; ++p) {
+      const auto numRows = partitionedVector->numRowsAt(p);
+      if (numRows == 0) {
+        continue;
+      }
+      if (rowsPerPartition_[p] == 0) {
+        ++numNonEmptyPartitions_;
+      }
+      rowsPerPartition_[p] += numRows;
+
+      int64_t partitionBytes = kHeaderSize + 4;
+      for (const auto& child : children_) {
+        partitionBytes += child->bytesPerPartition()[p];
+      }
+      bytesBuffered_ += partitionBytes - bytesPerPartition_[p];
+      bytesPerPartition_[p] = partitionBytes;
+    }
+  }
+
+  void clear() {
+    std::fill(rowsPerPartition_.begin(), rowsPerPartition_.end(), 0);
+    std::fill(bytesPerPartition_.begin(), bytesPerPartition_.end(), 0);
+    numNonEmptyPartitions_ = 0;
+    rowsBuffered_ = 0;
+    bytesBuffered_ = 0;
+    for (auto& child : children_) {
+      child->clear();
+    }
+  }
+
+  const std::vector<vector_size_t>& rowsPerPartition() const {
+    return rowsPerPartition_;
+  }
+
+  const std::vector<int64_t>& bytesPerPartition() const {
+    return bytesPerPartition_;
+  }
+
+  uint32_t numNonEmptyPartitions() const {
+    return numNonEmptyPartitions_;
+  }
+
+  vector_size_t rowsBuffered() const {
+    return rowsBuffered_;
+  }
+
+  int64_t bytesBuffered() const {
+    return bytesBuffered_;
+  }
+
+  const std::vector<std::unique_ptr<ColumnBufferState>>& children() const {
+    return children_;
+  }
+
+ private:
+  const uint32_t numPartitions_;
+  std::vector<vector_size_t> rowsPerPartition_;
+  std::vector<int64_t> bytesPerPartition_;
+  uint32_t numNonEmptyPartitions_{0};
+  vector_size_t rowsBuffered_{0};
+  int64_t bytesBuffered_{0};
+  std::vector<std::unique_ptr<ColumnBufferState>> children_;
+};
+
+// Builds the buffer state for rows of 'type': one ColumnBufferState per
+// top-level child, each sized for 'numPartitions' partitions. Column types
+// that ColumnBufferState::create does not handle make this throw.
+std::unique_ptr<BufferState> BufferState::create(
+    const RowTypePtr& type,
+    uint32_t numPartitions) {
+  std::vector<std::unique_ptr<ColumnBufferState>> children;
+  children.reserve(type->size());
+  // Unsigned index, as in BufferState::append, so the comparison against the
+  // column count does not mix signed and unsigned operands.
+  for (column_index_t column = 0; column < type->size(); ++column) {
+    children.push_back(
+        ColumnBufferState::create(type->childAt(column), numPartitions));
+  }
+  return std::make_unique<BufferState>(numPartitions, std::move(children));
+}
+
+// Constructs a serializer that splits rows of 'outputType' across
+// 'numPartitions' partitions.
+//
+// 'outputToInputChannels' is either empty or holds exactly one input column
+// index per output column. 'listenerFactory' may be empty; flushUncompressed()
+// only invokes it when it is set.
+//
+// Note that the buffer state is built in the initializer list, i.e. before the
+// argument checks in the body run, so an unsupported column type is reported
+// ahead of an invalid 'numPartitions' or 'pool'.
+PrestoIterativePartitioningSerializer::PrestoIterativePartitioningSerializer(
+    RowTypePtr outputType,
+    uint32_t numPartitions,
+    const SerdeOpts& opts,
+    memory::MemoryPool* pool,
+    std::vector<column_index_t> outputToInputChannels,
+    std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory)
+    : outputType_(std::move(outputType)),
+      outputToInputChannels_(std::move(outputToInputChannels)),
+      numPartitions_(numPartitions),
+      opts_(opts),
+      pool_(pool),
+      listenerFactory_(std::move(listenerFactory)),
+      numColumns_(outputType_->size()),
+      bufferState_(BufferState::create(outputType_, numPartitions_)) {
+  VELOX_CHECK_GT(numPartitions_, 0);
+  VELOX_CHECK_NOT_NULL(pool_);
+  // An empty mapping means "identity"; otherwise it must cover every output
+  // column.
+  VELOX_CHECK(
+      outputToInputChannels_.empty() ||
+          outputToInputChannels_.size() == outputType_->size(),
+      "outputToInputChannels size must match output column count");
+}
+
+// Defaulted here, in the translation unit where BufferState is fully defined.
+// NOTE(review): presumably required because the header only forward-declares
+// BufferState while the serializer owns one through a smart pointer; confirm
+// against the member declaration.
+PrestoIterativePartitioningSerializer::
+    ~PrestoIterativePartitioningSerializer() = default;
+
+// Returns the total page bytes currently accounted for across all partitions,
+// as tracked by the buffer state.
+int64_t PrestoIterativePartitioningSerializer::bytesBuffered() const {
+  return bufferState_->bytesBuffered();
+}
+
+// Returns the number of rows appended since the last clear(), summed over all
+// partitions.
+vector_size_t PrestoIterativePartitioningSerializer::rowsBuffered() const {
+  return bufferState_->rowsBuffered();
+}
+
+// Drops every buffered batch and resets all size and row counters, so the
+// serializer can start accumulating the next set of pages.
+void PrestoIterativePartitioningSerializer::clear() {
+  partitionedRowVectors_.clear();
+  bufferState_->clear();
+}
+
+// Verifies that 'input' can feed every output column: the mapped input column
+// index must exist, the child vector must be present, and its type must be
+// equivalent to the declared output column type. Throws on the first
+// violation; does not modify any state.
+void PrestoIterativePartitioningSerializer::validateOutputInputMapping(
+    const RowVectorPtr& input) const {
+  const auto numInputColumns = input->childrenSize();
+  for (column_index_t outputColumn = 0; outputColumn < numColumns_;
+       ++outputColumn) {
+    const auto inputColumn = outputToInputChannel(outputColumn);
+    VELOX_CHECK_LT(
+        inputColumn,
+        numInputColumns,
+        "Output column {} maps to invalid input column {}",
+        outputColumn,
+        inputColumn);
+
+    const auto& child = input->childAt(inputColumn);
+    VELOX_CHECK_NOT_NULL(
+        child,
+        "Output column {} maps to null input column {}",
+        outputColumn,
+        inputColumn);
+
+    // Bind by reference: this runs for every column on every append() and
+    // estimateBytesAfterAppend() call, and copying the type pointer would
+    // cost a reference-count round trip each time.
+    const auto& expectedType = outputType_->childAt(outputColumn);
+    VELOX_CHECK(
+        child->type()->equivalent(*expectedType),
+        "Output column {} expects {}, got {} from input column {}",
+        outputColumn,
+        expectedType->toString(),
+        child->type()->toString(),
+        inputColumn);
+  }
+}
+
+// Returns a worst-case estimate of bytesBuffered() if 'input' were appended
+// now. Nothing is buffered or modified. The routing of rows to partitions is
+// not known here, so the estimate assumes the most expensive placement.
+//
+// Throws if 'input' is null, fails validateOutputInputMapping(), or contains a
+// column that is not fixed-width.
+int64_t PrestoIterativePartitioningSerializer::estimateBytesAfterAppend(
+    const RowVectorPtr& input) const {
+  VELOX_CHECK_NOT_NULL(input);
+  validateOutputInputMapping(input);
+
+  // Appending zero rows changes nothing.
+  if (input->size() == 0) {
+    return bytesBuffered();
+  }
+
+  const auto numRows = input->size();
+
+  // Worst case: each input row lands in a distinct empty partition, capped by
+  // the number of empty partitions.
+  const auto numNewPartitions = std::min<uint32_t>(
+      numRows, numPartitions_ - bufferState_->numNonEmptyPartitions());
+  // One page header per newly non-empty partition.
+  auto estimatedBytes =
+      bufferState_->bytesBuffered() + numNewPartitions * (kHeaderSize + 4);
+
+  // Cache per input column. If multiple output columns map to the same input
+  // column, reuse the already computed incremental bytes.
+  std::vector<std::optional<int64_t>> estimatedIncrementalBytes(
+      input->childrenSize());
+  for (column_index_t column = 0; column < numColumns_; ++column) {
+    const auto inputColumn = outputToInputChannel(column);
+    if (estimatedIncrementalBytes[inputColumn].has_value()) {
+      estimatedBytes += *estimatedIncrementalBytes[inputColumn];
+      continue;
+    }
+    const auto& columnType = outputType_->childAt(column);
+    if (columnType->isUnknown()) {
+      VELOX_UNSUPPORTED(
+          "Unsupported type kind for "
+          "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}",
+          columnType->kind());
+    } else if (columnType->isFixedWidth()) {
+      const auto* columnState = bufferState_->children()[column].get();
+      // When the null count is not available, the bitmap bound below treats
+      // every row as potentially null, while the value-bytes term treats no
+      // row as null; both choices push the estimate up.
+      const auto inputNulls = countNulls(*input->childAt(inputColumn));
+      // A partition can only carry a null bitmap if it is non-empty, so the
+      // count is capped by the number of non-empty partitions after append.
+      const auto partitionsWithNulls = std::min<uint32_t>(
+          bufferState_->numNonEmptyPartitions() + numNewPartitions,
+          columnState->numPartitionsWithNulls() + inputNulls.value_or(numRows));
+      const auto nullBitmapBytes = maxBitmapBytes(
+          bufferState_->rowsBuffered() + numRows, partitionsWithNulls);
+      auto nullBitmapBytesBuffered = columnState->nullBitmapBytesBuffered();
+      VELOX_DCHECK_GE(nullBitmapBytes, nullBitmapBytesBuffered);
+
+      estimatedIncrementalBytes[inputColumn] = numNewPartitions *
+              simpleColumnBytes(columnType, 0, 0) + // header growth
+          nullBitmapBytes -
+          nullBitmapBytesBuffered + // null bitmap growth
+          static_cast<int64_t>(numRows - inputNulls.value_or(0)) *
+              fixedTypeWidth(columnType->kind()); // value bytes growth
+      estimatedBytes += *estimatedIncrementalBytes[inputColumn];
+    } else {
+      // Every remaining kind throws: the listed ones as not-yet-implemented,
+      // anything else as unsupported.
+      switch (columnType->kind()) {
+        case TypeKind::VARCHAR:
+        case TypeKind::VARBINARY:
+        case TypeKind::ROW:
+        case TypeKind::ARRAY:
+        case TypeKind::MAP:
+          VELOX_NYI(
+              "Unsupported type kind for "
+              "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}",
+              columnType->kind());
+        default:
+          VELOX_UNSUPPORTED(
+              "Unsupported type kind for "
+              "PrestoIterativePartitioningSerializer::estimateBytesAfterAppend: {}",
+              columnType->kind());
+      }
+    }
+  }
+  return estimatedBytes;
+}
+
+// Buffers 'input' for the next flush(). 'partitions' must hold one entry per
+// input row. The batch is partitioned via PartitionedVector::create, its sizes
+// are added to the buffer state, and the partitioned batch is kept until
+// flush() or clear().
+//
+// Throws if 'input' is null, if the sizes disagree, or if the input does not
+// match the output schema. Empty input is accepted and ignored.
+void PrestoIterativePartitioningSerializer::append(
+    const RowVectorPtr& input,
+    const std::vector<uint32_t>& partitions) {
+  VELOX_CHECK_NOT_NULL(input);
+  VELOX_CHECK_EQ(
+      input->size(),
+      partitions.size(),
+      "partitions.size() must equal input->size()");
+
+  validateOutputInputMapping(input);
+
+  // Nothing to buffer; in particular no empty batch is recorded, so a later
+  // flush sees an empty batch list when only empty inputs were appended.
+  if (input->size() == 0) {
+    return;
+  }
+
+  PartitionBuildContext ctx;
+  auto partitionedRowVector = PartitionedVector::create(
+      std::static_pointer_cast<BaseVector>(input),
+      partitions,
+      numPartitions_,
+      ctx,
+      pool_);
+
+  // Update the size accounting before handing the batch over to the list.
+  bufferState_->append(partitionedRowVector, outputToInputChannels_);
+  partitionedRowVectors_.push_back(std::move(partitionedRowVector));
+}
+
+// ---------------------------------------------------------------------------
+// Top-level flush
+// ---------------------------------------------------------------------------
+
+// Serializes everything buffered so far into one page per non-empty partition
+// and then resets the serializer. The result maps a partition number to its
+// page bytes and row count. If serialization throws, the buffered state is
+// left untouched because clear() is only reached on success.
+std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+PrestoIterativePartitioningSerializer::flush() {
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+      serializedPages;
+  if (opts_.compressionKind == common::CompressionKind::CompressionKind_NONE) {
+    serializedPages = flushUncompressed();
+  } else {
+    serializedPages = flushCompressed();
+  }
+
+  clear();
+  return serializedPages;
+}
+
+// Writes all buffered batches as uncompressed pages, one per non-empty
+// partition, and returns them keyed by partition number together with the
+// partition's row count. Returns an empty map when nothing is buffered. Does
+// not reset the buffered state; flush() does that afterwards.
+std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+PrestoIterativePartitioningSerializer::flushUncompressed() {
+  if (partitionedRowVectors_.empty()) {
+    return {};
+  }
+
+  // 1. Determine non-empty partitions.
+  std::vector<uint32_t> nonEmptyPartitions;
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    if (bufferState_->rowsPerPartition()[p] > 0) {
+      nonEmptyPartitions.push_back(p);
+    }
+  }
+  const auto& rowSchema = outputType_->asRow();
+
+  // 2. Create per-partition listeners first so the codec mask can be derived
+  // from whether the factory actually produced a listener. The factory may
+  // return nullptr (e.g. when OutputBufferManager has no listener factory
+  // set), in which case checksumming is skipped and the checksum bit must not
+  // be set in the codec byte.
+  std::vector<std::unique_ptr<OutputStreamListener>> listeners(numPartitions_);
+  for (uint32_t p : nonEmptyPartitions) {
+    if (listenerFactory_) {
+      listeners[p] = listenerFactory_();
+    }
+  }
+  // Only the first non-empty partition's listener is inspected.
+  // NOTE(review): this assumes the factory is consistent, i.e. returns either
+  // a listener for every call or for none; confirm with the factory contract.
+  const bool checksumEnabled = !nonEmptyPartitions.empty() &&
+      listeners[nonEmptyPartitions[0]] != nullptr;
+  const char codecMask = getCodecMarker(checksumEnabled);
+
+  // 3. Create output streams sized to the exact bytes each partition will need,
+  // so that the entire payload fits. This avoids multiple resizing and copying.
+  // All three vectors are indexed by partition number; entries of empty
+  // partitions stay null / default.
+  std::vector<std::unique_ptr<IOBufOutputStream>> outputStreams(numPartitions_);
+  std::vector<IOBufOutputStream*> rawOutputStreams(numPartitions_);
+  std::vector<std::streampos> beginStreamPositions(numPartitions_);
+
+  for (uint32_t p : nonEmptyPartitions) {
+    outputStreams[p] = std::make_unique<IOBufOutputStream>(
+        *pool_, listeners[p].get(), bufferState_->bytesPerPartition()[p]);
+    rawOutputStreams[p] = outputStreams[p].get();
+    beginStreamPositions[p] = outputStreams[p]->tellp();
+
+    flushStart(*outputStreams[p], p, codecMask);
+  }
+
+  // 4. Flush column data.
+  flushRowChildren(
+      partitionedRowVectors_, rowSchema, nonEmptyPartitions, rawOutputStreams);
+
+  // 5. Finalize the page by seeking back to fill in sizes and CRC, and get the
+  // IOBuf and numOfRows from each stream.
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+      result;
+  for (uint32_t p : nonEmptyPartitions) {
+    flushFinish(
+        *outputStreams[p],
+        p,
+        beginStreamPositions[p],
+        codecMask,
+        listeners[p].get());
+    result[p] = std::make_pair(
+        outputStreams[p]->getIOBuf(), bufferState_->rowsPerPartition()[p]);
+  }
+
+  return result;
+}
+
+// Compressed output is not implemented yet: always throws. flush() reaches
+// this whenever the configured compression kind is anything other than
+// CompressionKind_NONE.
+std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+PrestoIterativePartitioningSerializer::flushCompressed() {
+  VELOX_NYI();
+}
+
+// ---------------------------------------------------------------------------
+// Second level functions: start, columns and finish
+// ---------------------------------------------------------------------------
+
+// Begins the page of 'partition' on 'out': writes a page header holding the
+// row count and codec byte, with the remaining header bytes zeroed as
+// placeholders, followed by the column count.
+//
+// If the stream's listener is a PrestoOutputStreamListener it is paused while
+// the header is written and resumed before the column count, so the listener
+// observes the column count but not the header.
+void PrestoIterativePartitioningSerializer::flushStart(
+    IOBufOutputStream& out,
+    uint32_t partition,
+    char codecMask) const {
+  auto* listener = dynamic_cast<PrestoOutputStreamListener*>(out.listener());
+  if (listener) {
+    listener->pause();
+  }
+
+  // Write 21-byte Presto page header; sizes and CRC are filled in later.
+  const int32_t numRows =
+      static_cast<int32_t>(bufferState_->rowsPerPartition()[partition]);
+  // Value-initialized, so every byte not set below is zero.
+  char header[kHeaderSize] = {};
+  std::memcpy(&header[0], &numRows, 4);
+  std::memcpy(&header[4], &codecMask, 1);
+  out.write(header, kHeaderSize);
+
+  if (listener) {
+    listener->resume();
+  }
+
+  // Number of columns is included in the CRC.
+  const int32_t numCols = static_cast<int32_t>(numColumns_);
+  out.write(reinterpret_cast<const char*>(&numCols), 4);
+}
+
+// Serializes the top-level columns one after another. For each output column
+// of 'rowSchema', the matching child is collected from every buffered batch
+// (honouring the output-to-input column mapping) and handed to flushColumn().
+//
+// 'outputStreams' is indexed by partition number; 'nonEmptyPartitions' lists
+// the partitions that have a stream.
+void PrestoIterativePartitioningSerializer::flushRowChildren(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const RowType& rowSchema,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  // Downcast each batch once up front. The cast does not depend on the column
+  // being flushed, so repeating it inside the column loop would cost one RTTI
+  // lookup per batch per column.
+  std::vector<std::shared_ptr<PartitionedRowVector>> rowVectors;
+  rowVectors.reserve(partitionedVectors.size());
+  for (const auto& partitionedVector : partitionedVectors) {
+    auto rowVector =
+        std::dynamic_pointer_cast<PartitionedRowVector>(partitionedVector);
+    VELOX_DCHECK_NOT_NULL(rowVector.get());
+    rowVectors.push_back(std::move(rowVector));
+  }
+
+  // Scratch list of one column's children; cleared and refilled per column so
+  // its storage is allocated only once.
+  std::vector<PartitionedVectorPtr> column;
+  column.reserve(rowVectors.size());
+  for (uint32_t col = 0; col < rowSchema.size(); ++col) {
+    const auto inputColumn = outputToInputChannel(col);
+    column.clear();
+    for (const auto& rowVector : rowVectors) {
+      column.push_back(rowVector->childAt(inputColumn));
+    }
+
+    flushColumn(
+        column, rowSchema.childAt(col), nonEmptyPartitions, outputStreams);
+  }
+}
+
+// Completes the page of 'partition' that starts at 'beginOffset' in 'out':
+// computes the payload size and, when a Presto listener is present, the
+// checksum, then seeks back into the header to fill in the uncompressed size,
+// the compressed size (currently the same value) and the checksum. The write
+// position is restored to the end of the page before returning.
+//
+// The listener, if any, is paused first so that patching the header does not
+// feed into it; it is not resumed here.
+//
+// Throws if the payload size cannot be represented in the 32-bit size fields.
+void PrestoIterativePartitioningSerializer::flushFinish(
+    IOBufOutputStream& out,
+    uint32_t partition,
+    std::streampos beginOffset,
+    char codecMask,
+    OutputStreamListener* listener) const {
+  auto* prestoListener = dynamic_cast<PrestoOutputStreamListener*>(listener);
+  if (prestoListener) {
+    prestoListener->pause();
+  }
+
+  // Keep stream positions and byte counts in their own types: positions stay
+  // std::streampos, distances are std::streamoff, and only the value written
+  // to the wire is narrowed to 32 bits.
+  const std::streampos endOffset = out.tellp();
+  const std::streamoff pageBytes = endOffset - beginOffset;
+  const std::streamoff payloadBytes = pageBytes - kHeaderSize;
+  const auto uncompressedSize = static_cast<int32_t>(payloadBytes);
+  // Refuse to emit a page whose size field would be silently truncated.
+  VELOX_CHECK_EQ(
+      static_cast<std::streamoff>(uncompressedSize),
+      payloadBytes,
+      "Serialized page for partition {} does not fit in a 32-bit size field",
+      partition);
+
+  int64_t crc = 0;
+  if (prestoListener) {
+    crc = computeChecksum(
+        *prestoListener,
+        static_cast<int8_t>(codecMask),
+        static_cast<int32_t>(bufferState_->rowsPerPartition()[partition]),
+        uncompressedSize);
+  }
+
+  out.seekp(beginOffset + kUncompressedSizeOffset);
+  writeInt32(&out, uncompressedSize);
+  writeInt32(&out, uncompressedSize); // TODO: compressedSize
+  writeInt64(&out, crc);
+  // Return to the end of the page so the stream can be finalized.
+  out.seekp(endOffset);
+}
+
+// ---------------------------------------------------------------------------
+// Column-level dispatch
+// ---------------------------------------------------------------------------
+
+// Serializes one column, given as that column's child from every buffered
+// batch, into the streams of all non-empty partitions. Dispatches on the type
+// kind of the first batch's vector; 'partitionedVectors' must not be empty.
+//
+// Only the fixed-width scalar kinds listed in the first group are handled. The
+// second group throws as not-yet-implemented and anything else as unsupported.
+void PrestoIterativePartitioningSerializer::flushColumn(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const TypePtr& colType,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  VELOX_CHECK_GT(partitionedVectors.size(), 0);
+
+  // The kind is read from the first batch only; the remaining batches are not
+  // inspected here.
+  auto typeKind = partitionedVectors[0]->baseVector()->typeKind();
+  switch (typeKind) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+    case TypeKind::SMALLINT:
+    case TypeKind::INTEGER:
+    case TypeKind::BIGINT:
+    case TypeKind::REAL:
+    case TypeKind::DOUBLE:
+    case TypeKind::HUGEINT:
+      flushSimpleColumn(
+          partitionedVectors, colType, nonEmptyPartitions, outputStreams);
+      break;
+
+    case TypeKind::TIMESTAMP:
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+    case TypeKind::ROW:
+    case TypeKind::ARRAY:
+    case TypeKind::MAP:
+      VELOX_NYI(
+          "Unsupported vector type kind for PrestoIterativePartitioningSerializer: {}",
+          typeKind);
+
+    default:
+      VELOX_UNSUPPORTED(
+          "Invalid vector type kind for PrestoIterativePartitioningSerializer: {}",
+          typeKind);
+  }
+}
+
+// Writes one fixed-width column to every non-empty partition in the order the
+// format expects: encoding name, row count, null section, and finally the
+// values of each buffered batch in append order.
+void PrestoIterativePartitioningSerializer::flushSimpleColumn(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const TypePtr& colType,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  // Column preamble, written once per partition.
+  flushHeader(typeToEncodingName(colType), nonEmptyPartitions, outputStreams);
+  flushRowCounts(nonEmptyPartitions, outputStreams);
+  flushNulls(partitionedVectors, nonEmptyPartitions, outputStreams);
+
+  // Values follow, batch by batch.
+  for (const auto& batch : partitionedVectors) {
+    flushSingleSimpleVector(batch, outputStreams);
+  }
+}
+
+// Writes the values of one flat batch of a fixed-width column. Resolves the
+// native value type for 'kind', pulls the raw value and null buffers out of
+// the batch, and lets flushFlatValues() copy each partition's values into its
+// stream.
+template <TypeKind kind>
+void PrestoIterativePartitioningSerializer::flushSingleFlatVector(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  using NativeType = typename TypeTraits<kind>::NativeType;
+
+  auto* partitionedFlat =
+      partitionedVector->as<PartitionedFlatVector<NativeType>>();
+  VELOX_DCHECK_NOT_NULL(partitionedFlat);
+
+  const auto* values = partitionedFlat->baseVector()
+                           ->template as<FlatVector<NativeType>>()
+                           ->rawValues();
+  const auto* nulls = partitionedFlat->baseVector()->rawNulls();
+  const auto* offsets = partitionedFlat->rawPartitionOffsets();
+
+  flushFlatValues<NativeType>(values, nulls, offsets, outputStreams);
+}
+
+// Specialization for BOOLEAN, which is serialized with the kByteArray
+// encoding. A FlatVector<bool> keeps its values bit-packed, so there is no
+// plain value array to copy from; instead every non-null value is expanded to
+// a single byte, 0x00 or 0x01, and written to its partition's stream.
+template <>
+void PrestoIterativePartitioningSerializer::flushSingleFlatVector<
+    TypeKind::BOOLEAN>(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  auto* partitionedBools = partitionedVector->as<PartitionedFlatVector<bool>>();
+  VELOX_DCHECK_NOT_NULL(partitionedBools);
+
+  const auto* packedValues = partitionedBools->baseVector()
+                                 ->as<FlatVector<bool>>()
+                                 ->rawValues<uint64_t>();
+  const auto* nulls = partitionedBools->baseVector()->rawNulls();
+  const auto* offsets = partitionedBools->rawPartitionOffsets();
+
+  // TODO: Improve performance
+  vector_size_t begin = 0;
+  for (uint32_t partition = 0; partition < numPartitions_; ++partition) {
+    const auto end = offsets[partition];
+    const auto nullCount = partitionedVector->numNullsAt(partition);
+    auto* stream = outputStreams[partition];
+
+    if (stream != nullptr && end - begin > 0) {
+      // The null buffer is only consulted for partitions that report nulls.
+      const bool hasNulls = !(nullCount == 0);
+      if (hasNulls) {
+        VELOX_DCHECK_NOT_NULL(nulls);
+      }
+      for (vector_size_t row = begin; row < end; ++row) {
+        if (hasNulls && bits::isBitNull(nulls, row)) {
+          continue;
+        }
+        const int8_t byte = bits::isBitSet(packedValues, row) ? 1 : 0;
+        stream->write(reinterpret_cast<const char*>(&byte), 1);
+      }
+    }
+    begin = end;
+  }
+}
+
+// Writes the values of one constant-encoded batch of a fixed-width column.
+// A null constant contributes no value bytes at all. Otherwise the constant
+// is written once per row of each partition, using a pre-filled chunk of
+// repeated values so that large partitions are written in few calls.
+//
+// VARCHAR, VARBINARY and TIMESTAMP are rejected as not-yet-implemented.
+template <TypeKind kind>
+void PrestoIterativePartitioningSerializer::flushSingleConstantVector(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  if constexpr (
+      kind == TypeKind::VARCHAR || kind == TypeKind::VARBINARY ||
+      kind == TypeKind::TIMESTAMP) {
+    VELOX_NYI(
+        "flushSingleConstantVector does not support variable-length type: {}",
+        kind);
+  }
+
+  using T = typename TypeTraits<kind>::NativeType;
+  auto* constantVector =
+      partitionedVector->baseVector()->template as<ConstantVector<T>>();
+  VELOX_DCHECK_NOT_NULL(constantVector);
+
+  // All rows are null, and null rows carry no value bytes.
+  if (constantVector->isNullAt(0)) {
+    return;
+  }
+
+  const auto value = constantVector->valueAtFast(0);
+  const auto* partitionOffsets = partitionedVector->rawPartitionOffsets();
+
+  Scratch scratch;
+  ScratchPtr<T> values(scratch);
+  // At least one value per chunk, even if a value is wider than kChunkBytes.
+  const auto numRowsPerChunk =
+      std::max<vector_size_t>(1, kChunkBytes / sizeof(T));
+  // Filled lazily on the first partition that actually has rows.
+  const char* chunkBytes = nullptr;
+
+  // 'partitionOffsets[p]' is read as the end offset of partition p, so the
+  // row count of p is the distance to the previous partition's end.
+  vector_size_t lastOffset = 0;
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    const auto offset = partitionOffsets[p];
+    auto numRows = offset - lastOffset;
+    if (numRows > 0) {
+      VELOX_DCHECK_NOT_NULL(outputStreams[p]);
+
+      if (chunkBytes == nullptr) {
+        auto* ptr = values.get(numRowsPerChunk);
+        std::fill_n(ptr, numRowsPerChunk, value);
+        chunkBytes = reinterpret_cast<const char*>(ptr);
+      }
+
+      // Emit the partition's rows in chunk-sized pieces; the last piece may be
+      // shorter.
+      while (numRows > 0) {
+        auto n = std::min<vector_size_t>(numRowsPerChunk, numRows);
+        outputStreams[p]->write(chunkBytes, n * sizeof(T));
+        numRows -= n;
+      }
+    }
+    lastOffset = offset;
+  }
+}
+
+// Writes the value bytes of a single batch of a fixed-width column, choosing
+// the writer from the batch's vector encoding and type kind. FLAT and CONSTANT
+// are handled; BIASED, DICTIONARY and SEQUENCE throw as not-yet-implemented
+// and any other encoding as unsupported.
+void PrestoIterativePartitioningSerializer::flushSingleSimpleVector(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  auto encoding = partitionedVector->baseVector()->encoding();
+  auto typeKind = partitionedVector->baseVector()->typeKind();
+
+  switch (encoding) {
+    case VectorEncoding::Simple::FLAT:
+      // Expands to a call of flushSingleFlatVector<typeKind>(...).
+      VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+          flushSingleFlatVector, typeKind, partitionedVector, outputStreams);
+      break;
+    case VectorEncoding::Simple::CONSTANT:
+      VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+          flushSingleConstantVector,
+          typeKind,
+          partitionedVector,
+          outputStreams);
+      break;
+    case VectorEncoding::Simple::BIASED:
+    case VectorEncoding::Simple::DICTIONARY:
+    case VectorEncoding::Simple::SEQUENCE:
+      VELOX_NYI(
+          "Unsupported vector encoding for PrestoIterativePartitioningSerializer: {}",
+          encoding);
+    default:
+      VELOX_UNSUPPORTED(
+          "Invalid vector encoding for PrestoIterativePartitioningSerializer:flushSingleSimpleVector: {}",
+          encoding);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Column building blocks
+// ---------------------------------------------------------------------------
+
+// Writes the column encoding name to every non-empty partition as a 32-bit
+// length followed by the name's bytes (no terminator).
+void PrestoIterativePartitioningSerializer::flushHeader(
+    std::string_view name,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  const auto length = static_cast<int32_t>(name.size());
+  const char* const bytes = name.data();
+  for (const auto partition : nonEmptyPartitions) {
+    auto* stream = outputStreams[partition];
+    writeInt32(stream, length);
+    stream->write(bytes, length);
+  }
+}
+
+// Writes, for every non-empty partition, the number of rows buffered for that
+// partition as a 32-bit integer.
+void PrestoIterativePartitioningSerializer::flushRowCounts(
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  const auto& rowCounts = bufferState_->rowsPerPartition();
+  for (const auto partition : nonEmptyPartitions) {
+    const auto rowCount = static_cast<int32_t>(rowCounts[partition]);
+    writeInt32(outputStreams[partition], rowCount);
+  }
+}
+
+// Writes the null section of one column for every non-empty partition.
+//
+// Each partition first receives a one-byte flag telling whether any buffered
+// row of this column is null in that partition. Partitions with at least one
+// null then receive a bitmap covering all of their buffered rows, built by
+// concatenating the null bits of every batch in 'partitionedVectors' in order.
+//
+// Only FLAT and CONSTANT batches are handled; other encodings throw.
+void PrestoIterativePartitioningSerializer::flushNulls(
+    const std::vector<PartitionedVectorPtr>& partitionedVectors,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  std::vector<vector_size_t> nullCounts(numPartitions_, 0);
+  for (uint32_t p : nonEmptyPartitions) {
+    for (const auto& pv : partitionedVectors) {
+      nullCounts[p] += pv->numNullsAt(p);
+    }
+    const char flagByte = nullCounts[p] > 0 ? 1 : 0;
+    outputStreams[p]->write(&flagByte, 1);
+  }
+
+  // Without any null there is no bitmap to write for any partition.
+  const bool hasAnyNulls = std::any_of(
+      nonEmptyPartitions.begin(), nonEmptyPartitions.end(), [&](uint32_t p) {
+        return nullCounts[p] > 0;
+      });
+  if (!hasAnyNulls) {
+    return;
+  }
+
+  // Build each partition's null bitmap in a temporary buffer, accumulating
+  // bits across all batches. Writing via write() correctly handles range
+  // boundaries in the output stream without requiring seekp().
+  // TODO: Avoid this extra memory allocation and copy
+  //
+  // The buffers are filled through uint64_t* by the bit helpers used in
+  // flushSimpleVectorNulls() and flushConstantVectorNulls(), which operate on
+  // whole 64-bit words. Each buffer is therefore padded up to a multiple of
+  // eight bytes so that a word access to the last, partially used word stays
+  // inside the allocation. Only the first bits::nbytes(rows) bytes are
+  // converted and written below; the padding never reaches the stream.
+  constexpr size_t kWordBytes = sizeof(uint64_t);
+  std::vector<std::vector<uint8_t>> bitmaps(numPartitions_);
+  for (uint32_t p : nonEmptyPartitions) {
+    if (nullCounts[p] > 0) {
+      const size_t numBytes = bits::nbytes(bufferState_->rowsPerPartition()[p]);
+      const size_t paddedBytes =
+          (numBytes + kWordBytes - 1) / kWordBytes * kWordBytes;
+      bitmaps[p].assign(paddedBytes, bits::kNotNullByte);
+    }
+  }
+
+  // Next free bit position in each partition's bitmap; advanced by the helpers
+  // as each batch is merged in.
+  std::vector<vector_size_t> destBitOffsets(numPartitions_, 0);
+  for (const auto& pv : partitionedVectors) {
+    auto encoding = pv->baseVector()->encoding();
+    switch (encoding) {
+      case VectorEncoding::Simple::FLAT:
+        flushSimpleVectorNulls(pv, nonEmptyPartitions, bitmaps, destBitOffsets);
+        break;
+      case VectorEncoding::Simple::CONSTANT:
+        flushConstantVectorNulls(
+            pv, nonEmptyPartitions, bitmaps, destBitOffsets);
+        break;
+      case VectorEncoding::Simple::BIASED:
+      case VectorEncoding::Simple::DICTIONARY:
+      case VectorEncoding::Simple::SEQUENCE:
+        VELOX_NYI(
+            "Unsupported vector encoding for PrestoIterativePartitioningSerializer: {}",
+            encoding);
+      default:
+        VELOX_UNSUPPORTED(
+            "Invalid vector encoding for PrestoIterativePartitioningSerializer: {}",
+            encoding);
+    }
+  }
+
+  for (uint32_t p : nonEmptyPartitions) {
+    if (nullCounts[p] == 0) {
+      continue;
+    }
+
+    // Convert Velox format (LSB-first, 1=not-null) to Presto wire format
+    // (MSB-first, 1=null) in-place.
+    const int32_t numBytes = bits::nbytes(bufferState_->rowsPerPartition()[p]);
+    for (int32_t i = 0; i < numBytes; ++i) {
+      bitmaps[p][i] = ~bitmaps[p][i];
+      bits::reverseBits(&bitmaps[p][i], 1);
+    }
+
+    outputStreams[p]->write(
+        reinterpret_cast<const char*>(bitmaps[p].data()), numBytes);
+  }
+}
+
+// Merges the null bits of one flat batch into the per-partition bitmaps.
+// For each non-empty partition that has a bitmap, the batch's null bits for
+// that partition's row range are copied to the bitmap's current write position
+// and the position is advanced by the number of rows. When the batch has no
+// null buffer nothing is copied, but the position still advances.
+void PrestoIterativePartitioningSerializer::flushSimpleVectorNulls(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    std::vector<std::vector<uint8_t>>& bitmaps,
+    std::vector<vector_size_t>& destBitOffsets) {
+  const uint64_t* sourceNulls = partitionedVector->baseVector()->rawNulls();
+  const auto* partitionEnds = partitionedVector->rawPartitionOffsets();
+
+  // End offset of the previously visited partition, i.e. where the current
+  // partition's rows begin inside the batch.
+  vector_size_t rangeBegin = 0;
+  for (const auto partition : nonEmptyPartitions) {
+    const vector_size_t rowCount = partitionEnds[partition] - rangeBegin;
+    auto& bitmap = bitmaps[partition];
+    if (!bitmap.empty()) {
+      if (sourceNulls && rowCount > 0) {
+        bits::copyBits(
+            sourceNulls,
+            rangeBegin,
+            reinterpret_cast<uint64_t*>(bitmap.data()),
+            destBitOffsets[partition],
+            rowCount);
+      }
+      destBitOffsets[partition] += rowCount;
+    }
+    rangeBegin = partitionEnds[partition];
+  }
+}
+
+// Merges the null bits of one constant batch into the per-partition bitmaps.
+// If the constant is null, the bit range belonging to this batch is marked
+// null in every partition that has a bitmap; otherwise the bits are left as
+// they are. In both cases the bitmap's write position advances by the number
+// of rows the batch contributes to the partition.
+void PrestoIterativePartitioningSerializer::flushConstantVectorNulls(
+    const PartitionedVectorPtr& partitionedVector,
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    std::vector<std::vector<uint8_t>>& bitmaps,
+    std::vector<vector_size_t>& destBitOffsets) {
+  const bool allNull = partitionedVector->baseVector()->isNullAt(0);
+  const auto* partitionEnds = partitionedVector->rawPartitionOffsets();
+
+  // End offset of the previously visited partition within the batch.
+  vector_size_t rangeBegin = 0;
+  for (const auto partition : nonEmptyPartitions) {
+    const vector_size_t rowCount = partitionEnds[partition] - rangeBegin;
+    auto& bitmap = bitmaps[partition];
+    if (!bitmap.empty()) {
+      if (allNull && rowCount > 0) {
+        bits::fillBits(
+            reinterpret_cast<uint64_t*>(bitmap.data()),
+            destBitOffsets[partition],
+            destBitOffsets[partition] + rowCount,
+            bits::kNull);
+      }
+      destBitOffsets[partition] += rowCount;
+    }
+    rangeBegin = partitionEnds[partition];
+  }
+}
+
+// Copies the values of one flat batch into the partition streams.
+// 'partitionOffsets[p]' is the end offset of partition p inside
+// 'partitionedValues'. Partitions without a stream or without rows in this
+// batch are skipped. When the batch has no null buffer a partition's values
+// are written in one call; otherwise they are written one by one so that null
+// rows can be left out.
+template <typename T>
+void PrestoIterativePartitioningSerializer::flushFlatValues(
+    const T* partitionedValues,
+    const uint64_t* rawNulls,
+    const vector_size_t* partitionOffsets,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  constexpr auto kValueBytes = sizeof(T);
+
+  vector_size_t begin = 0;
+  for (uint32_t partition = 0; partition < numPartitions_; ++partition) {
+    const auto end = partitionOffsets[partition];
+    const auto count = end - begin;
+    auto* stream = outputStreams[partition];
+
+    if (stream != nullptr && count > 0) {
+      if (rawNulls == nullptr) {
+        stream->write(
+            reinterpret_cast<const char*>(partitionedValues + begin),
+            count * kValueBytes);
+      } else {
+        // Presto writes only non-null values; null slots are omitted.
+        // TODO: Improve performance
+        for (auto row = begin; row < end; ++row) {
+          if (bits::isBitNull(rawNulls, row)) {
+            continue;
+          }
+          stream->write(
+              reinterpret_cast<const char*>(partitionedValues + row),
+              kValueBytes);
+        }
+      }
+    }
+    begin = end;
+  }
+}
+
+// Writes, for every non-empty partition, the 32-bit sequence 0, 1, ..., N
+// where N is the number of rows buffered for that partition (N + 1 values in
+// total).
+void PrestoIterativePartitioningSerializer::flushSequentialOffsets(
+    const std::vector<uint32_t>& nonEmptyPartitions,
+    const std::vector<IOBufOutputStream*>& outputStreams) const {
+  const auto& rowCounts = bufferState_->rowsPerPartition();
+  for (const auto partition : nonEmptyPartitions) {
+    auto* stream = outputStreams[partition];
+    const auto lastOffset = static_cast<int32_t>(rowCounts[partition]);
+    int32_t offset = 0;
+    while (offset <= lastOffset) {
+      writeInt32(stream, offset);
+      ++offset;
+    }
+  }
+}
+
+} // namespace facebook::velox::serializer::presto
diff --git a/velox/serializers/PrestoIterativePartitioningSerializer.h b/velox/serializers/PrestoIterativePartitioningSerializer.h
new file mode 100644
index 00000000000..8ab7d31dc7e
--- /dev/null
+++ b/velox/serializers/PrestoIterativePartitioningSerializer.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <vector>
+
+#include <folly/io/IOBuf.h>
+
+#include "velox/common/memory/ByteStream.h"
+#include "velox/serializers/PrestoSerializer.h"
+#include "velox/type/Type.h"
+#include "velox/vector/PartitionedVector.h"
+
+namespace facebook::velox::serializer::presto {
+
+/// Convenience alias matching PrestoSerializer.cpp convention.
+using SerdeOpts = PrestoVectorSerde::PrestoOptions;
+
+class BufferState;
+
+/// Serializes a stream of RowVectors into per-partition Presto pages.
+///
+/// append() routes each input row to its assigned partition. flush() returns
+/// a map keyed by partition id with one Presto-format IOBuf per non-empty
+/// partition, and resets the internal state so the serializer can be reused.
+class PrestoIterativePartitioningSerializer {
+ public:
+  PrestoIterativePartitioningSerializer(
+      RowTypePtr outputType,
+      uint32_t numPartitions,
+      const SerdeOpts& opts,
+      memory::MemoryPool* pool)
+      : PrestoIterativePartitioningSerializer(
+            std::move(outputType),
+            numPartitions,
+            opts,
+            pool,
+            {},
+            nullptr) {}
+
+  /// Constructs the serializer with the identity column mapping (an empty
+  /// `outputToInputChannels`). If `listenerFactory` is non-null it is called
+  /// once per non-empty partition on each flush to create an
+  /// OutputStreamListener that accumulates the CRC32 checksum; the checksum
+  /// bit is then set in the Presto page codec byte and the computed value is
+  /// written into the page header. Pass nullptr to skip the checksum, matching
+  /// kNormal PartitionedOutput when OutputBufferManager has no factory set.
+  PrestoIterativePartitioningSerializer(
+      RowTypePtr outputType,
+      uint32_t numPartitions,
+      const SerdeOpts& opts,
+      memory::MemoryPool* pool,
+      std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory)
+      : PrestoIterativePartitioningSerializer(
+            std::move(outputType),
+            numPartitions,
+            opts,
+            pool,
+            {},
+            std::move(listenerFactory)) {}
+
+  /// Constructs the serializer with an explicit output-column to input-column
+  /// mapping. `outputToInputChannels[i]` indicates which child of the RowVector
+  /// passed to append() should be serialized for output column i. When empty,
+  /// output column i uses input child i. `listenerFactory` may be null.
+  PrestoIterativePartitioningSerializer(
+      RowTypePtr outputType,
+      uint32_t numPartitions,
+      const SerdeOpts& opts,
+      memory::MemoryPool* pool,
+      std::vector<column_index_t> outputToInputChannels,
+      std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory =
+          nullptr);
+
+  ~PrestoIterativePartitioningSerializer();
+
+  /// Returns a conservative estimate of bytesBuffered() after appending
+  /// `input`. The partition assignment of the input is not known at the time of
+  /// the call, so this assumes worst-case growth from new non-empty partitions
+  /// and may overestimate.
+  int64_t estimateBytesAfterAppend(const RowVectorPtr& input) const;
+
+  /// Routes each row in `input` to the partition indicated by `partitions`.
+  /// `partitions.size()` must equal `input->size()`.
+  void append(
+      const RowVectorPtr& input,
+      const std::vector<uint32_t>& partitions);
+
+  /// Serializes all buffered data into one Presto page per non-empty partition
+  /// and resets internal state. Returns an empty map if nothing has been
+  /// appended since the last flush. Map keys are partition ids.
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+  flush();
+
+  /// Returns the serialized bytes buffered across all partitions since the last
+  /// flush.
+  int64_t bytesBuffered() const;
+
+  /// Returns the total number of rows appended since the last flush.
+  vector_size_t rowsBuffered() const;
+
+ private:
+  void validateOutputInputMapping(const RowVectorPtr&) const;
+
+  column_index_t outputToInputChannel(column_index_t outputColumn) const {
+    return outputToInputChannels_.empty()
+        ? outputColumn
+        : outputToInputChannels_[outputColumn];
+  }
+
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+  flushUncompressed();
+  std::map<uint32_t, std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>
+  flushCompressed();
+
+  void clear();
+
+  void flushStart(IOBufOutputStream& out, uint32_t partition, char codecMask)
+      const;
+
+  void flushFinish(
+      IOBufOutputStream& out,
+      uint32_t partition,
+      std::streampos beginOffset,
+      char codecMask,
+      OutputStreamListener* listener) const;
+
+  void flushRowChildren(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const RowType& rowSchema,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  void flushColumn(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const TypePtr& colType,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  void flushSimpleColumn(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const TypePtr& colType,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  void flushSingleSimpleVector(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  template <TypeKind kind>
+  void flushSingleFlatVector(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  template <TypeKind kind>
+  void flushSingleConstantVector(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  void flushHeader(
+      std::string_view name,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  void flushRowCounts(
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  void flushNulls(
+      const std::vector<PartitionedVectorPtr>& partitionedVectors,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  static void flushSimpleVectorNulls(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      std::vector<std::vector<uint8_t>>& bitmaps,
+      std::vector<vector_size_t>& destBitOffsets);
+
+  static void flushConstantVectorNulls(
+      const PartitionedVectorPtr& partitionedVector,
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      std::vector<std::vector<uint8_t>>& bitmaps,
+      std::vector<vector_size_t>& destBitOffsets);
+
+  template <typename T>
+  void flushFlatValues(
+      const T* partitionedValues,
+      const uint64_t* rawNulls,
+      const vector_size_t* partitionOffsets,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  void flushSequentialOffsets(
+      const std::vector<uint32_t>& nonEmptyPartitions,
+      const std::vector<IOBufOutputStream*>& outputStreams) const;
+
+  RowTypePtr outputType_;
+  std::vector<column_index_t> outputToInputChannels_;
+  uint32_t numPartitions_;
+  SerdeOpts opts_;
+  memory::MemoryPool* pool_;
+
+  std::function<std::unique_ptr<OutputStreamListener>()> listenerFactory_;
+
+  /// Number of top-level columns in `outputType_`.
+  uint32_t numColumns_{0};
+
+  std::vector<PartitionedVectorPtr> partitionedRowVectors_;
+
+  /// Accumulated state for all batches buffered since the last
+  /// flush.
+  std::unique_ptr<BufferState> bufferState_;
+};
+
+} // namespace facebook::velox::serializer::presto
diff --git a/velox/serializers/benchmarks/CMakeLists.txt b/velox/serializers/benchmarks/CMakeLists.txt
index 7d1044e4367..a81530595e8 100644
--- a/velox/serializers/benchmarks/CMakeLists.txt
+++ b/velox/serializers/benchmarks/CMakeLists.txt
@@ -21,3 +21,17 @@ target_link_libraries(
   Folly::folly
   Folly::follybenchmark
 )
+
+add_executable(
+  velox_presto_iterative_partitioning_serializer_benchmark
+  PrestoIterativePartitioningSerializerBenchmark.cpp
+)
+
+target_link_libraries(
+  velox_presto_iterative_partitioning_serializer_benchmark
+  velox_presto_serializer
+  velox_vector_test_lib
+  velox_memory
+  Folly::folly
+  Folly::follybenchmark
+)
diff --git a/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
new file mode 100644
index 00000000000..ec6330f42ed
--- /dev/null
+++ b/velox/serializers/benchmarks/PrestoIterativePartitioningSerializerBenchmark.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::serializer::presto;
+
+constexpr int64_t kBufferSize = 2 * 1024 * 1024;
+
+namespace {
+
+/// Builds the inputs used by the flush benchmarks: flat or constant columns,
+/// a round-robin partition assignment, and the serializer under test.
+class PrestoIterativePartitioningSerializerBenchmark
+    : public test::VectorTestBase {
+ public:
+  /// Builds a flat vector of T whose value at each row is the row number.
+  /// When nullPct is non-zero, rows with (row % 100) < nullPct are null.
+  template <typename T>
+  VectorPtr makeFlatColumnOfType(vector_size_t size, int32_t nullPct) {
+    auto valueAt = [](auto row) { return static_cast<T>(row); };
+    if (nullPct != 0) {
+      auto isNullAt = [nullPct](auto row) { return (row % 100) < nullPct; };
+      return makeFlatVector<T>(size, valueAt, isNullAt);
+    }
+    return makeFlatVector<T>(size, valueAt);
+  }
+
+  /// Dispatches on `colKind` to makeFlatColumnOfType for the matching C++
+  /// type. Only BOOLEAN, INTEGER, BIGINT and HUGEINT are supported.
+  VectorPtr
+  makeFlatColumn(vector_size_t size, TypeKind colKind, int32_t nullPct) {
+    switch (colKind) {
+      case TypeKind::BOOLEAN:
+        return makeFlatColumnOfType<bool>(size, nullPct);
+      case TypeKind::INTEGER:
+        return makeFlatColumnOfType<int32_t>(size, nullPct);
+      case TypeKind::BIGINT:
+        return makeFlatColumnOfType<int64_t>(size, nullPct);
+      case TypeKind::HUGEINT:
+        return makeFlatColumnOfType<int128_t>(size, nullPct);
+      default:
+        VELOX_UNSUPPORTED(
+            "Unsupported TypeKind: {}", TypeKindName::toName(colKind));
+    }
+  }
+
+  /// Builds a constant vector of `colKind`: a null constant when
+  /// `nullConstant` is set, otherwise a fixed non-null value per kind.
+  VectorPtr
+  makeConstantColumn(vector_size_t size, TypeKind colKind, bool nullConstant) {
+    if (nullConstant) {
+      return makeNullConstant(colKind, size);
+    }
+    switch (colKind) {
+      case TypeKind::BOOLEAN:
+        return makeConstant<bool>(true, size);
+      case TypeKind::INTEGER:
+        return makeConstant<int32_t>(42, size);
+      case TypeKind::BIGINT:
+        return makeConstant<int64_t>(1000, size);
+      case TypeKind::HUGEINT:
+        return makeConstant<int128_t>(10000, size);
+      default:
+        VELOX_UNSUPPORTED(
+            "Unsupported TypeKind: {}", TypeKindName::toName(colKind));
+    }
+  }
+
+  /// Builds a RowVector of `numCols` columns named c0, c1, ... that all share
+  /// `colKind`. FLAT columns use `nullPct`; CONSTANT ones use `nullConstant`.
+  RowVectorPtr makeInput(
+      vector_size_t size,
+      VectorEncoding::Simple encoding,
+      TypeKind colKind,
+      uint32_t numCols,
+      int32_t nullPct,
+      bool nullConstant = false) {
+    // Reject unsupported encodings before building any column.
+    const bool isFlat = encoding == VectorEncoding::Simple::FLAT;
+    if (!isFlat && encoding != VectorEncoding::Simple::CONSTANT) {
+      VELOX_UNSUPPORTED("Unsupported encoding: {}", encoding);
+    }
+    std::vector<std::string> names;
+    std::vector<VectorPtr> children;
+    names.reserve(numCols);
+    children.reserve(numCols);
+    for (uint32_t i = 0; i < numCols; ++i) {
+      names.push_back(fmt::format("c{}", i));
+      children.push_back(
+          isFlat ? makeFlatColumn(size, colKind, nullPct)
+                 : makeConstantColumn(size, colKind, nullConstant));
+    }
+    return makeRowVector(names, children);
+  }
+
+  /// Assigns rows to partitions round-robin: row i goes to i % numPartitions.
+  std::vector<uint32_t> makePartitions(
+      vector_size_t size,
+      uint32_t numPartitions) {
+    std::vector<uint32_t> partitions(size);
+    uint32_t row = 0;
+    for (auto& partition : partitions) {
+      partition = row++ % numPartitions;
+    }
+    return partitions;
+  }
+
+  /// Builds a serializer with default serde options through the four-argument
+  /// constructor, i.e. without a listener factory.
+  std::unique_ptr<PrestoIterativePartitioningSerializer> makeSerializer(
+      const RowTypePtr& type,
+      uint32_t numPartitions) {
+    SerdeOpts opts;
+    return std::make_unique<PrestoIterativePartitioningSerializer>(
+        type, numPartitions, opts, pool_.get());
+  }
+};
+
+} // namespace
+
+/// Shared body of benchmarkFlushFlat / benchmarkFlushConstant, which are the
+/// functions registered via BENCHMARK_NAMED_PARAM below. Performs one flush.
+///
+/// Appends the same 10'000-row batch until bytesBuffered() reaches kBufferSize.
+/// Setup (input creation, serializer construction, append) is excluded from
+/// the measured time.
+void benchmarkFlush(
+    VectorEncoding::Simple encoding,
+    TypeKind colKind,
+    uint32_t numCols,
+    int32_t nullPct,
+    bool nullConstant,
+    uint32_t numPartitions) {
+  folly::BenchmarkSuspender suspender;
+  PrestoIterativePartitioningSerializerBenchmark benchmark;
+  auto input = benchmark.makeInput(
+      10'000, encoding, colKind, numCols, nullPct, nullConstant);
+  auto parts = benchmark.makePartitions(10'000, numPartitions);
+  auto serializer = benchmark.makeSerializer(
+      std::static_pointer_cast<const RowType>(input->type()), numPartitions);
+
+  while (serializer->bytesBuffered() < kBufferSize) {
+    serializer->append(input, parts);
+  }
+
+  suspender.dismiss();
+
+  auto result = serializer->flush();
+  folly::doNotOptimizeAway(result);
+}
+
+/// Runs `iters` flushes of FLAT columns, as folly expects of a parameterized
+/// benchmark; previously the iteration count was ignored and only one ran.
+void benchmarkFlushFlat(
+    uint32_t iters,
+    TypeKind colKind,
+    uint32_t numCols,
+    int32_t nullPct,
+    uint32_t numPartitions) {
+  constexpr auto kFlat = VectorEncoding::Simple::FLAT;
+  for (uint32_t i = 0; i < iters; ++i) {
+    // benchmarkFlush() suspends timing while it builds and fills its input.
+    benchmarkFlush(kFlat, colKind, numCols, nullPct, false, numPartitions);
+  }
+}
+
+/// Runs `iters` flushes of CONSTANT columns, as folly expects of a
+/// parameterized benchmark; previously the iteration count was ignored.
+void benchmarkFlushConstant(
+    uint32_t iters,
+    TypeKind colKind,
+    uint32_t numCols,
+    bool nullConstant,
+    uint32_t numPartitions) {
+  constexpr auto kConstant = VectorEncoding::Simple::CONSTANT;
+  for (uint32_t i = 0; i < iters; ++i) {
+    // nullPct does not apply to constant vectors, so 0 is passed.
+    benchmarkFlush(kConstant, colKind, numCols, 0, nullConstant, numPartitions);
+  }
+}
+
+// clang-format off
+// Dimensions:
+//   col type:       {bool, int, bigint, ldec (HUGEINT)}
+//   num cols:       {1, 4, 16, 64}
+//   null pct:       {0, 25, 50, 75, 100}
+//   num partitions: {1, 4, 16, 64, 256, 1024}
+//
+// Param name: <type>_<N>cols_<P>pct_<K>parts
+#define FLUSH_FLAT_PARAM(type_name, kind, num_cols, null_pct, num_parts)      \
+  BENCHMARK_NAMED_PARAM(                                                      \
+      benchmarkFlushFlat,                                                     \
+      type_name##_##num_cols##cols_##null_pct##pct_##num_parts##parts, \
+      TypeKind::kind,                                                         \
+      num_cols,                                                               \
+      null_pct,                                                               \
+      num_parts)
+
+// Dimensions:
+//   col type:       {bool, int, bigint, ldec (HUGEINT)}
+//   num cols:       {1, 4, 16, 64}
+//   null constant:  {false, true}
+//   num partitions: {1, 4, 16, 64, 256, 1024}
+//
+// Param name: <type>_<N>cols_{notnull,null}_<K>parts
+#define FLUSH_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)           \
+  BENCHMARK_NAMED_PARAM(                                                     \
+      benchmarkFlushConstant,                                                \
+      type_name##_##num_cols##cols_##notnull_##num_parts##parts,             \
+      TypeKind::kind,                                                        \
+      num_cols,                                                              \
+      false,                                                                 \
+      num_parts)
+
+#define FLUSH_NULL_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)  \
+  BENCHMARK_NAMED_PARAM(                                                 \
+      benchmarkFlushConstant,                                            \
+      type_name##_##num_cols##cols_##null_##num_parts##parts,            \
+      TypeKind::kind,                                                    \
+      num_cols,                                                          \
+      true,                                                              \
+      num_parts)
+
+#define FLUSH_FOR_NULLS(type_name, kind, num_cols, num_parts) \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 0, num_parts)   \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 25, num_parts)  \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 50, num_parts)  \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 75, num_parts)  \
+  FLUSH_FLAT_PARAM(type_name, kind, num_cols, 100, num_parts) \
+  FLUSH_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)  \
+  FLUSH_NULL_CONSTANT_PARAM(type_name, kind, num_cols, num_parts)
+
+#define FLUSH_FOR_PARTS(type_name, kind, num_cols) \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 1)    \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 4)    \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 16)   \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 64)   \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 256)  \
+  FLUSH_FOR_NULLS(type_name, kind, num_cols, 1024)
+
+#define FLUSH_FOR_COLS(type_name, kind) \
+  FLUSH_FOR_PARTS(type_name, kind, 1)   \
+  FLUSH_FOR_PARTS(type_name, kind, 4)   \
+  FLUSH_FOR_PARTS(type_name, kind, 16)  \
+  FLUSH_FOR_PARTS(type_name, kind, 64)
+
+FLUSH_FOR_COLS(bool, BOOLEAN)
+FLUSH_FOR_COLS(int, INTEGER)
+FLUSH_FOR_COLS(bigint, BIGINT)
+FLUSH_FOR_COLS(ldec, HUGEINT)
+// clang-format on
+
+int main(int argc, char** argv) {
+  folly::Init follyInit{&argc, &argv};
+  // Initialize Velox memory and the Presto serde, then run every benchmark.
+  memory::MemoryManager::initialize(memory::MemoryManager::Options{});
+  PrestoVectorSerde::registerVectorSerde();
+  folly::runBenchmarks();
+}
diff --git a/velox/serializers/tests/CMakeLists.txt b/velox/serializers/tests/CMakeLists.txt
index f7f69461ef4..2d1a40275b5 100644
--- a/velox/serializers/tests/CMakeLists.txt
+++ b/velox/serializers/tests/CMakeLists.txt
@@ -36,6 +36,7 @@ target_link_libraries(
 set(
   VELOX_SERIALIZER_TEST_SOURCES
   CompactRowSerializerTest.cpp
+  PrestoIterativePartitioningSerializerTest.cpp
   PrestoOutputStreamListenerTest.cpp
   PrestoSerializerTest.cpp
   SerializedPageFileTest.cpp
@@ -51,6 +52,7 @@ set(
   velox_row_fast
   GTest::gtest
   GTest::gtest_main
+  GTest::gmock
   glog::glog
 )
 
diff --git a/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
new file mode 100644
index 00000000000..4116632f762
--- /dev/null
+++ b/velox/serializers/tests/PrestoIterativePartitioningSerializerTest.cpp
@@ -0,0 +1,1120 @@
+/*
+ * Copyright (c) International Business Machines Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <random>
+#include <string_view>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "velox/common/base/BitUtil.h"
+#include "velox/common/base/tests/GTestUtils.h"
+#include "velox/serializers/PrestoIterativePartitioningSerializer.h"
+
+#include "velox/serializers/PrestoSerializerSerializationUtils.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::serializer::presto;
+using namespace facebook::velox::test;
+
+namespace {
+
+int64_t simpleColumnPageBytes(
+    std::string_view encodingName,
+    int64_t numRows,
+    int64_t numNulls,
+    int64_t valueWidth) {
+  // Fixed part: page header, column count, column-header int, row count.
+  const int64_t fixed = serializer::presto::detail::kHeaderSize + 4 + 4 + 4;
+  const int64_t name = static_cast<int64_t>(encodingName.size());
+  const int64_t nulls = 1 + (numNulls > 0 ? bits::nbytes(numRows) : 0);
+  return fixed + name + nulls + (numRows - numNulls) * valueWidth;
+}
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// Shared base fixture
+// ---------------------------------------------------------------------------
+
+class PrestoIterativePartitioningSerializerTestBase : public VectorTestBase {
+ protected:
+  /// Installs a test MemoryManager and registers the Presto serde if absent.
+  static void SetUpTestSuite() {
+    memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{});
+    if (!isRegisteredVectorSerde()) {
+      PrestoVectorSerde::registerVectorSerde();
+    }
+  }
+
+  /// Deserializes one page produced by
+  /// PrestoIterativePartitioningSerializer::flush() into a RowVector.
+  RowVectorPtr deserialize(folly::IOBuf& iobuf, const RowTypePtr& type) {
+    BufferInputStream input(byteRangesFromIOBuf(&iobuf));
+    RowVectorPtr page;
+    serde_.deserialize(&input, pool_.get(), type, &page, nullptr);
+    return page;
+  }
+
+  /// Returns the values of a flat column sorted in ascending order.
+  template <typename T>
+  std::vector<T> sortedValues(const RowVectorPtr& row, int column) {
+    const auto* raw = row->childAt(column)->as<FlatVector<T>>()->rawValues();
+    std::vector<T> sorted(raw, raw + row->size());
+    std::sort(sorted.begin(), sorted.end());
+    return sorted;
+  }
+
+  /// Returns a column's values in row order, with std::nullopt for null rows.
+  template <typename T>
+  std::vector<std::optional<T>> nullableValues(
+      const RowVectorPtr& row,
+      int column) {
+    const auto& child = row->childAt(column);
+    std::vector<std::optional<T>> values;
+    values.reserve(row->size());
+    for (vector_size_t i = 0; i < row->size(); ++i) {
+      if (child->isNullAt(i)) {
+        values.emplace_back(std::nullopt);
+        continue;
+      }
+      values.emplace_back(child->as<FlatVector<T>>()->valueAt(i));
+    }
+    return values;
+  }
+
+  /// Builds a PrestoIterativePartitioningSerializer with default serde options.
+  /// Note that this also installs a PrestoOutputStreamListener factory, exactly
+  /// as makeSerializerWithListener() does.
+  std::unique_ptr<PrestoIterativePartitioningSerializer> makeSerializer(
+      const RowTypePtr& type,
+      uint32_t numPartitions) {
+    SerdeOpts opts;
+    auto makeListener = []() -> std::unique_ptr<OutputStreamListener> {
+      return std::make_unique<PrestoOutputStreamListener>();
+    };
+    return std::make_unique<PrestoIterativePartitioningSerializer>(
+        type, numPartitions, opts, pool_.get(), std::move(makeListener));
+  }
+
+  /// Builds a serializer that computes a CRC32 checksum on each flush via a
+  /// PrestoOutputStreamListener factory, matching the kOptimized path when
+  /// OutputBufferManager has a listener factory set.
+  std::unique_ptr<PrestoIterativePartitioningSerializer>
+  makeSerializerWithListener(const RowTypePtr& type, uint32_t numPartitions) {
+    SerdeOpts opts;
+    auto makeListener = []() -> std::unique_ptr<OutputStreamListener> {
+      return std::make_unique<PrestoOutputStreamListener>();
+    };
+    return std::make_unique<PrestoIterativePartitioningSerializer>(
+        type, numPartitions, opts, pool_.get(), std::move(makeListener));
+  }
+
+  // Presto page header layout: [numRows:4][codec:1][uncompressedSize:4]
+  //                             [compressedSize:4][checksum:8]
+  static constexpr int kCodecByteOffset = 4;
+  static constexpr int kChecksumOffset = kCodecByteOffset + 1 + 4 + 4;
+  static constexpr int8_t kChecksumBitMask = 4;
+
+  /// Returns the codec byte of the Presto page header at the start of `iobuf`.
+  static int8_t codecByte(const folly::IOBuf& iobuf) {
+    VELOX_CHECK_GE(iobuf.length(), kChecksumOffset + 8);
+    return static_cast<int8_t>(iobuf.data()[kCodecByteOffset]);
+  }
+
+  /// Returns the 8-byte checksum field of the Presto page header in `iobuf`.
+  static int64_t checksumField(const folly::IOBuf& iobuf) {
+    VELOX_CHECK_GE(iobuf.length(), kChecksumOffset + 8);
+    // The field starts at byte 13, so copy it out instead of loading in place.
+    int64_t checksum;
+    std::memcpy(&checksum, iobuf.data() + kChecksumOffset, sizeof(checksum));
+    return checksum;
+  }
+
+  /// Sums the chained data length of every flushed page in `pages`.
+  int64_t totalFlushedBytes(
+      std::map<
+          uint32_t,
+          std::pair<std::unique_ptr<folly::IOBuf>, vector_size_t>>& pages)
+      const {
+    int64_t total = 0;
+    for (const auto& entry : pages) {
+      total += entry.second.first->computeChainDataLength();
+    }
+    return total;
+  }
+
+  /// Serde used by deserialize().
+  PrestoVectorSerde serde_;
+};
+
+template <>
+std::vector<bool> PrestoIterativePartitioningSerializerTestBase::sortedValues<
+    bool>(const RowVectorPtr& row, int column) {
+  // Element-wise read; the primary template copies rawValues() instead.
+  auto* bools = row->childAt(column)->as<FlatVector<bool>>();
+  std::vector<bool> sorted(row->size());
+  for (vector_size_t i = 0; i < row->size(); ++i) {
+    sorted[i] = bools->valueAtFast(i);
+  }
+  std::sort(sorted.begin(), sorted.end());
+  return sorted;
+}
+
+// ---------------------------------------------------------------------------
+// Value-parameterized fixture: routing and null handling over scalar TypePtrs.
+// Columns come from BaseVector::create() + setNull(); no C++ type dispatch.
+// ---------------------------------------------------------------------------
+
+class PrestoIterativePartitioningSerializerParamTest
+    : public ::testing::TestWithParam<TypePtr>,
+      public PrestoIterativePartitioningSerializerTestBase {
+ public:
+  static void SetUpTestSuite() {
+    PrestoIterativePartitioningSerializerTestBase::SetUpTestSuite();
+  }
+};
+
+// Short lowercase name used as the test-name suffix for each scalar type;
+// kinds without a short name fall back to Type::toString().
+std::string scalarTypeName(const TypePtr& type) {
+  const TypeKind kind = type->kind();
+  // clang-format off
+  return kind == TypeKind::BOOLEAN ? "bool"
+       : kind == TypeKind::INTEGER ? "int"
+       : kind == TypeKind::BIGINT  ? "bigint"
+       : kind == TypeKind::HUGEINT ? "hugeint"
+                                   : type->toString();
+  // clang-format on
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ScalarTypes,
+    PrestoIterativePartitioningSerializerParamTest,
+    ::testing::Values(BOOLEAN(), INTEGER(), BIGINT(), HUGEINT()),
+    [](const ::testing::TestParamInfo<TypePtr>& paramInfo) {
+      return scalarTypeName(paramInfo.param);
+    });
+
+// ── Routing ──────────────────────────────────────────────────────────────────
+
+// One append split evenly over two partitions; also checks that flush()
+// resets the rowsBuffered / bytesBuffered counters.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, basicTwoPartitions) {
+  auto columnType = GetParam();
+  auto rowType = ROW({"a"}, {columnType});
+  auto column = BaseVector::create(columnType, 6, pool_.get());
+  auto batch = makeRowVector({"a"}, {column});
+
+  // Rows alternate between the partitions: even rows → 0, odd rows → 1.
+  const std::vector<uint32_t> assignment{0, 1, 0, 1, 0, 1};
+  auto serializer = makeSerializer(rowType, 2);
+  serializer->append(batch, assignment);
+  EXPECT_EQ(serializer->rowsBuffered(), 6);
+
+  auto pages = serializer->flush();
+  ASSERT_EQ(pages.size(), 2);
+
+  // Flushing leaves nothing buffered.
+  EXPECT_EQ(serializer->rowsBuffered(), 0);
+  EXPECT_EQ(serializer->bytesBuffered(), 0);
+
+  auto first = deserialize(*pages.at(0).first, rowType);
+  auto second = deserialize(*pages.at(1).first, rowType);
+  EXPECT_EQ(first->size(), 3);
+  EXPECT_EQ(second->size(), 3);
+}
+
+// Every row targets partition 2 of 4, so only that partition yields a page.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, allRowsToOnePartition) {
+  auto columnType = GetParam();
+  auto rowType = ROW({"x"}, {columnType});
+  auto column = BaseVector::create(columnType, 5, pool_.get());
+  auto batch = makeRowVector({"x"}, {column});
+
+  auto serializer = makeSerializer(rowType, 4);
+  serializer->append(batch, std::vector<uint32_t>(5, 2));
+  auto pages = serializer->flush();
+
+  ASSERT_EQ(pages.size(), 1);
+  ASSERT_TRUE(pages.count(2));
+  EXPECT_EQ(deserialize(*pages.at(2).first, rowType)->size(), 5);
+}
+
+// With numPartitions = 1 every row lands in partition 0.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, singlePartition) {
+  auto columnType = GetParam();
+  auto rowType = ROW({"a"}, {columnType});
+  auto column = BaseVector::create(columnType, 5, pool_.get());
+  auto batch = makeRowVector({"a"}, {column});
+
+  auto serializer = makeSerializer(rowType, 1);
+  serializer->append(batch, {0, 0, 0, 0, 0});
+  auto pages = serializer->flush();
+
+  ASSERT_EQ(pages.size(), 1);
+  EXPECT_EQ(deserialize(*pages.at(0).first, rowType)->size(), 5);
+}
+
+// Two columns of the same type. Rows 0,1 → p0 and rows 2,3 → p1, so every
+// flushed page must deserialize to two rows with both children present and
+// sized to match.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, multipleColumns) {
+  auto columnType = GetParam();
+  auto rowType = ROW({"a", "b"}, {columnType, columnType});
+  auto first = BaseVector::create(columnType, 4, pool_.get());
+  auto second = BaseVector::create(columnType, 4, pool_.get());
+  auto batch = makeRowVector({"a", "b"}, {first, second});
+
+  auto serializer = makeSerializer(rowType, 2);
+  serializer->append(batch, {0, 0, 1, 1});
+  auto pages = serializer->flush();
+
+  ASSERT_EQ(pages.size(), 2);
+
+  // The same expectations hold for partition 0 and partition 1.
+  for (uint32_t partition = 0; partition < 2; ++partition) {
+    auto page = deserialize(*pages.at(partition).first, rowType);
+    EXPECT_EQ(page->size(), 2);
+    EXPECT_EQ(page->childAt(0)->size(), 2);
+    EXPECT_EQ(page->childAt(1)->size(), 2);
+  }
+}
+
+
+// ── Null handling
+// ─────────────────────────────────────────────────────────────
+
+// Only partition 0 sees a null. Rows 0-2 go to p0 and rows 3-4 to p1, with
+// row 1 null, so p0 = [value, null, value] and p1 = [value, value].
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsInOnePartition) {
+  auto columnType = GetParam();
+  auto rowType = ROW({"a"}, {columnType});
+  auto column = BaseVector::create(columnType, 5, pool_.get());
+  column->setNull(1, true);
+  auto batch = makeRowVector({"a"}, {column});
+
+  auto serializer = makeSerializer(rowType, 2);
+  serializer->append(batch, {0, 0, 0, 1, 1});
+  auto pages = serializer->flush();
+  ASSERT_EQ(pages.size(), 2);
+
+  auto page0 = deserialize(*pages.at(0).first, rowType);
+  ASSERT_EQ(page0->size(), 3);
+  const auto& values0 = page0->childAt(0);
+  EXPECT_FALSE(values0->isNullAt(0));
+  EXPECT_TRUE(values0->isNullAt(1));
+  EXPECT_FALSE(values0->isNullAt(2));
+
+  auto page1 = deserialize(*pages.at(1).first, rowType);
+  ASSERT_EQ(page1->size(), 2);
+  const auto& values1 = page1->childAt(0);
+  EXPECT_FALSE(values1->isNullAt(0));
+  EXPECT_FALSE(values1->isNullAt(1));
+}
+
+// Nulls contributed by different appends to the same partition.
+// Append 1: rows 0,1 → p0 (row 1 null); row 2 → p1.
+// Append 2: row 0 → p0 (null); row 1 → p1.
+// p0: [not-null, null, null]; p1: [not-null, not-null].
+TEST_P(
+    PrestoIterativePartitioningSerializerParamTest,
+    nullsAcrossMultipleAppends) {
+  auto elementType = GetParam();
+  auto rowType = ROW({"a"}, {elementType});
+  auto serializer = makeSerializer(rowType, 2);
+  // First append: three rows, row 1 is null.
+  auto firstColumn = BaseVector::create(elementType, 3, pool_.get());
+  firstColumn->setNull(1, true);
+  serializer->append(makeRowVector({"a"}, {firstColumn}), {0, 0, 1});
+  // Second append: two rows, row 0 is null.
+  auto secondColumn = BaseVector::create(elementType, 2, pool_.get());
+  secondColumn->setNull(0, true);
+  serializer->append(makeRowVector({"a"}, {secondColumn}), {0, 1});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  ASSERT_EQ(p0->size(), 3);
+  EXPECT_FALSE(p0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(1));
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(2));
+
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  ASSERT_EQ(p1->size(), 2);
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(1));
+}
+
+// Partition boundary falls in the middle of a null-bitmap byte, exercising the
+// bit-extraction carry-over logic. 5 rows → p0, 4 rows → p1. The boundary at
+// bit 5 is inside the first byte of the null bitmap. Rows 1,3,5,7 are null.
+// p0: [not-null, null, not-null, null, not-null].
+// p1: [null, not-null, null, not-null].
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsUnalignedBoundary) {
+  auto elementType = GetParam();
+  auto rowType = ROW({"a"}, {elementType});
+  auto column = BaseVector::create(elementType, 9, pool_.get());
+  // Odd rows 1,3,5,7 are null; rows 0,2,4,6,8 are left not-null.
+  for (const auto row : {1, 3, 5, 7}) {
+    column->setNull(row, true);
+  }
+  auto batch = makeRowVector({"a"}, {column});
+
+  auto serializer = makeSerializer(rowType, 2);
+  serializer->append(batch, {0, 0, 0, 0, 0, 1, 1, 1, 1});
+  auto output = serializer->flush();
+
+  ASSERT_EQ(output.size(), 2);
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  ASSERT_EQ(p0->size(), 5);
+  EXPECT_FALSE(p0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(1));
+  EXPECT_FALSE(p0->childAt(0)->isNullAt(2));
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(3));
+  EXPECT_FALSE(p0->childAt(0)->isNullAt(4));
+
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  ASSERT_EQ(p1->size(), 4);
+  EXPECT_TRUE(p1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(1));
+  EXPECT_TRUE(p1->childAt(0)->isNullAt(2));
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(3));
+}
+
+// Both partitions contain nulls.
+// Input: 4 rows, rows 1 and 2 null; rows 0,1 → p0; rows 2,3 → p1.
+// p0: [not-null, null]; p1: [null, not-null].
+TEST_P(PrestoIterativePartitioningSerializerParamTest, nullsInBothPartitions) {
+  auto elementType = GetParam();
+  auto rowType = ROW({"a"}, {elementType});
+  auto column = BaseVector::create(elementType, 4, pool_.get());
+  column->setNull(1, true);
+  column->setNull(2, true);
+  auto batch = makeRowVector({"a"}, {column});
+
+  auto serializer = makeSerializer(rowType, 2);
+  serializer->append(batch, {0, 0, 1, 1});
+  auto output = serializer->flush();
+
+  ASSERT_EQ(output.size(), 2);
+  // The null of partition 0 is its last row; the null of partition 1 its first.
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  ASSERT_EQ(p0->size(), 2);
+  EXPECT_FALSE(p0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(1));
+
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  ASSERT_EQ(p1->size(), 2);
+  EXPECT_TRUE(p1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(1));
+}
+
+// All rows in one partition are null; the other partition is non-null.
+// Input: 3 rows, rows 0,1 null; rows 0,1 → p0; row 2 → p1.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, allNullsInPartition) {
+  auto elementType = GetParam();
+  auto rowType = ROW({"a"}, {elementType});
+  auto column = BaseVector::create(elementType, 3, pool_.get());
+  column->setNull(0, true);
+  column->setNull(1, true);
+  auto batch = makeRowVector({"a"}, {column});
+
+  auto serializer = makeSerializer(rowType, 2);
+  serializer->append(batch, {0, 0, 1});
+  auto output = serializer->flush();
+
+  ASSERT_EQ(output.size(), 2);
+  // Partition 0 consists solely of the two null rows.
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  ASSERT_EQ(p0->size(), 2);
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(0));
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(1));
+
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  ASSERT_EQ(p1->size(), 1);
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(0));
+}
+
+// A null batch followed by a null-free batch for the same partition.
+// Regression: bitmaps must be initialized to all-not-null so that rows from
+// the null-free batch (rawNulls == nullptr) are not decoded as null.
+TEST_P(
+    PrestoIterativePartitioningSerializerParamTest,
+    nullBatchFollowedByNullFreeBatch) {
+  auto elementType = GetParam();
+  auto rowType = ROW({"a"}, {elementType});
+  auto serializer = makeSerializer(rowType, 2);
+
+  // First batch carries a null: row 0 → p0 (null); row 1 → p1 (not-null).
+  auto withNull = BaseVector::create(elementType, 2, pool_.get());
+  withNull->setNull(0, true);
+  serializer->append(makeRowVector({"a"}, {withNull}), {0, 1});
+
+  // Second batch has no null set (rawNulls == nullptr): row 0 → p0; row 1 → p1.
+  auto nullFree = BaseVector::create(elementType, 2, pool_.get());
+  serializer->append(makeRowVector({"a"}, {nullFree}), {0, 1});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  // p0 = [null from batch 1, not-null from batch 2]
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  ASSERT_EQ(p0->size(), 2);
+  EXPECT_TRUE(p0->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(p0->childAt(0)->isNullAt(1));
+
+  // p1 = [not-null from batch 1, not-null from batch 2]
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  ASSERT_EQ(p1->size(), 2);
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(0));
+  EXPECT_FALSE(p1->childAt(0)->isNullAt(1));
+}
+
+// ---------------------------------------------------------------------------
+// Non-typed fixture (TEST_F) — lifecycle, structural, regression
+// ---------------------------------------------------------------------------
+
+class PrestoIterativePartitioningSerializerTest
+    : public ::testing::Test, // plain fixture used by the TEST_F cases below
+      public PrestoIterativePartitioningSerializerTestBase {
+ public:
+  static void SetUpTestSuite() { // forwards suite setup to the shared base
+    PrestoIterativePartitioningSerializerTestBase::SetUpTestSuite();
+  }
+};
+
+// Appending an empty RowVector produces no ioBufs on flush.
+TEST_F(PrestoIterativePartitioningSerializerTest, appendEmptyVector) {
+  auto rowType = ROW({"a"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 2); // two partitions, both unused
+  serializer->append(makeRowVector({"a"}, {makeFlatVector<int64_t>({})}), {});
+  EXPECT_TRUE(serializer->flush().empty());
+}
+
+// ── Lifecycle
+// ─────────────────────────────────────────────────────────────────
+
+// Multiple append() calls accumulate correctly before flush.
+TEST_F(PrestoIterativePartitioningSerializerTest, multipleAppends) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 3);
+  // Two appends of three rows each; every partition ends up with two rows.
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({100, 200, 300})}),
+      {0, 1, 2});
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({400, 500, 600})}),
+      {2, 0, 1});
+
+  EXPECT_EQ(serializer->rowsBuffered(), 6);
+  const auto pendingBytes = serializer->bytesBuffered();
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 3);
+  EXPECT_EQ(pendingBytes, totalFlushedBytes(output));
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  auto p2 = deserialize(*output.at(2).first, rowType);
+
+  ASSERT_EQ(p0->size(), 2);
+  ASSERT_EQ(p1->size(), 2);
+  ASSERT_EQ(p2->size(), 2);
+
+  EXPECT_EQ(sortedValues<int64_t>(p0, 0), (std::vector<int64_t>{100, 500}));
+  EXPECT_EQ(sortedValues<int64_t>(p1, 0), (std::vector<int64_t>{200, 600}));
+  EXPECT_EQ(sortedValues<int64_t>(p2, 0), (std::vector<int64_t>{300, 400}));
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    bytesBufferedPartitionGrowth) {
+  auto type = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(type, 2);
+  // Reference size the test expects to be buffered after one single-row append.
+  const auto singleRowPageBytes = simpleColumnPageBytes("LONG_ARRAY", 1, 0, 8);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10})}), {0});
+  EXPECT_EQ(serializer->bytesBuffered(), singleRowPageBytes);
+  // Merely building the next input must leave the buffered size untouched.
+  auto input = makeRowVector({"v"}, {makeFlatVector<int64_t>({20})});
+  EXPECT_EQ(serializer->bytesBuffered(), singleRowPageBytes);
+  // One row for the other partition is expected to double the buffered size.
+  serializer->append(input, {1});
+  const auto bytesBuffered = serializer->bytesBuffered();
+  EXPECT_EQ(bytesBuffered, 2 * singleRowPageBytes);
+
+  auto ioBufs = serializer->flush();
+  EXPECT_EQ(serializer->bytesBuffered(), 0);
+  EXPECT_EQ(bytesBuffered, totalFlushedBytes(ioBufs));
+}
+
+TEST_F(PrestoIterativePartitioningSerializerTest, bytesBufferedNullFlagGrowth) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 1);
+  // Start with eight not-null rows in the only partition.
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6, 7, 8})}),
+      std::vector<uint32_t>(8, 0));
+  EXPECT_EQ(
+      serializer->bytesBuffered(),
+      simpleColumnPageBytes("LONG_ARRAY", 8, 0, 8));
+  // Creating the next batch alone must not change what is buffered.
+  auto batch =
+      makeRowVector({"v"}, {makeNullableFlatVector<int64_t>({std::nullopt})});
+  EXPECT_EQ(
+      serializer->bytesBuffered(),
+      simpleColumnPageBytes("LONG_ARRAY", 8, 0, 8));
+  // Appending a ninth row that is null.
+  serializer->append(batch, {0});
+  const auto pendingBytes = serializer->bytesBuffered();
+  EXPECT_EQ(pendingBytes, simpleColumnPageBytes("LONG_ARRAY", 9, 1, 8));
+
+  auto output = serializer->flush();
+  EXPECT_EQ(serializer->bytesBuffered(), 0);
+  EXPECT_EQ(pendingBytes, totalFlushedBytes(output));
+}
+
+// A flush-time output mapping serializes one input column into multiple output
+// columns.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    duplicateOutputColumnAtFlush) {
+  auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()});
+  SerdeOpts options;
+  auto serializer = std::make_unique<PrestoIterativePartitioningSerializer>(
+      outputType, 2, options, pool_.get(), std::vector<column_index_t>{0, 0});
+  // Both output columns map to input column 0.
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 11, 12, 13})}),
+      {0, 1, 0, 1});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  auto p0 = deserialize(*output.at(0).first, outputType);
+  auto p1 = deserialize(*output.at(1).first, outputType);
+
+  ASSERT_EQ(p0->size(), 2);
+  ASSERT_EQ(p1->size(), 2);
+
+  EXPECT_EQ(sortedValues<int64_t>(p0, 0), (std::vector<int64_t>{10, 12}));
+  EXPECT_EQ(sortedValues<int64_t>(p0, 1), (std::vector<int64_t>{10, 12}));
+  EXPECT_EQ(sortedValues<int64_t>(p1, 0), (std::vector<int64_t>{11, 13}));
+  EXPECT_EQ(sortedValues<int64_t>(p1, 1), (std::vector<int64_t>{11, 13}));
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    outputInputMappingOutOfRange) {
+  auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()});
+  SerdeOpts options;
+  auto serializer = std::make_unique<PrestoIterativePartitioningSerializer>(
+      outputType, 2, options, pool_.get(), std::vector<column_index_t>{0, 1});
+  // The appended input has a single column, so mapping entry 1 has no source.
+  VELOX_ASSERT_THROW(
+      serializer->append(
+          makeRowVector({"v"}, {makeFlatVector<int64_t>({10, 11})}), {0, 1}),
+      "Output column 1 maps to invalid input column 1");
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    outputInputMappingTypeMismatch) {
+  auto outputType = ROW({"v1", "v2"}, {BIGINT(), BIGINT()});
+  SerdeOpts options;
+  auto serializer = std::make_unique<PrestoIterativePartitioningSerializer>(
+      outputType, 2, options, pool_.get(), std::vector<column_index_t>{0, 1});
+  // Input column 1 is INTEGER while output column 1 is declared BIGINT.
+  VELOX_ASSERT_THROW(
+      serializer->append(
+          makeRowVector(
+              {"v1", "v2"},
+              {
+                  makeFlatVector<int64_t>({10, 11}),
+                  makeFlatVector<int32_t>({12, 13}),
+              }),
+          {0, 1}),
+      "Output column 1 expects BIGINT, got INTEGER from input column 1");
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendExactForSinglePartition) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 1);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6, 7, 8})}),
+      std::vector<uint32_t>(8, 0));
+  // Take the estimate before the append, then compare with the real size.
+  auto batch =
+      makeRowVector({"v"}, {makeNullableFlatVector<int64_t>({std::nullopt})});
+  const auto predictedBytes = serializer->estimateBytesAfterAppend(batch);
+
+  serializer->append(batch, {0});
+  EXPECT_EQ(predictedBytes, serializer->bytesBuffered());
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendExactForConstant) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 1);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4})}),
+      std::vector<uint32_t>(4, 0));
+  // The second batch is a constant vector: value 7 repeated twice.
+  auto batch = makeRowVector({"v"}, {makeConstant<int64_t>(7, 2)});
+  const auto predictedBytes = serializer->estimateBytesAfterAppend(batch);
+
+  serializer->append(batch, std::vector<uint32_t>(2, 0));
+  EXPECT_EQ(predictedBytes, serializer->bytesBuffered());
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendExactForNullConstant) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 1);
+
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4, 5, 6, 7, 8})}),
+      std::vector<uint32_t>(8, 0));
+  // The second batch is a null constant spanning 80 rows.
+  auto batch = makeRowVector({"v"}, {makeConstant<int64_t>(std::nullopt, 80)});
+  const auto predictedBytes = serializer->estimateBytesAfterAppend(batch);
+
+  serializer->append(batch, std::vector<uint32_t>(80, 0));
+  EXPECT_EQ(predictedBytes, serializer->bytesBuffered());
+}
+
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    estimateBytesAfterAppendOverestimatesPartitionedAppend) {
+  auto rowType = ROW({"a", "b"}, {BIGINT(), INTEGER()});
+  auto serializer = makeSerializer(rowType, 3);
+  // Seed partitions 0 and 1 with one row each; partition 2 stays empty.
+  serializer->append(
+      makeRowVector(
+          {"a", "b"},
+          {
+              makeFlatVector<int64_t>({10, 20}),
+              makeFlatVector<int32_t>({100, 200}),
+          }),
+      {0, 1});
+
+  auto batch = makeRowVector(
+      {"a", "b"},
+      {
+          makeNullableFlatVector<int64_t>({30, std::nullopt, 50, 60}),
+          makeNullableFlatVector<int32_t>({300, 400, std::nullopt, 600}),
+      });
+
+  // All rows land in an already non-empty partition, but
+  // estimateBytesAfterAppend still assumes this input could go to the last
+  // empty partition, because the real distribution is not known yet.
+  const std::vector<uint32_t> targets{1, 1, 1, 1};
+
+  const auto predictedBytes = serializer->estimateBytesAfterAppend(batch);
+
+  serializer->append(batch, targets);
+  EXPECT_GT(predictedBytes, serializer->bytesBuffered());
+}
+
+// Flush twice: second flush on empty state returns an empty map.
+TEST_F(PrestoIterativePartitioningSerializerTest, flushTwice) {
+  auto rowType = ROW({"a"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 2);
+  serializer->append(
+      makeRowVector({"a"}, {makeFlatVector<int64_t>({10, 20})}), {0, 1});
+
+  auto firstFlush = serializer->flush();
+  ASSERT_EQ(firstFlush.size(), 2);
+  // Nothing was appended since the first flush, so the second yields nothing.
+  EXPECT_TRUE(serializer->flush().empty());
+}
+
+// Append and flush multiple independent cycles.
+TEST_F(PrestoIterativePartitioningSerializerTest, multipleCycles) {
+  auto rowType = ROW({"a"}, {INTEGER()});
+  auto serializer = makeSerializer(rowType, 2);
+  // The same serializer is reused for three append-then-flush rounds.
+  for (int cycle = 0; cycle < 3; ++cycle) {
+    serializer->append(
+        makeRowVector(
+            {"a"}, {makeFlatVector<int32_t>({cycle * 2, cycle * 2 + 1})}),
+        {0, 1});
+    auto output = serializer->flush();
+    ASSERT_EQ(output.size(), 2) << "cycle " << cycle;
+
+    auto p0 = deserialize(*output.at(0).first, rowType);
+    auto p1 = deserialize(*output.at(1).first, rowType);
+    ASSERT_EQ(p0->size(), 1) << "cycle " << cycle;
+    ASSERT_EQ(p1->size(), 1) << "cycle " << cycle;
+    EXPECT_EQ(p0->childAt(0)->as<FlatVector<int32_t>>()->valueAt(0), cycle * 2);
+    EXPECT_EQ(
+        p1->childAt(0)->as<FlatVector<int32_t>>()->valueAt(0), cycle * 2 + 1);
+  }
+}
+
+// ── Encoding
+// ─────────────────────────────────────────────────────────────────
+
+// Constant vectors are flattened across append() calls.
+TEST_F(PrestoIterativePartitioningSerializerTest, constantColumnAcrossAppends) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 3);
+  // Four rows of constant 11, then five rows of constant 22.
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(11, 4)}), {0, 1, 0, 2});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(22, 5)}), {2, 0, 1, 1, 2});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 3);
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  auto p2 = deserialize(*output.at(2).first, rowType);
+
+  EXPECT_EQ(sortedValues<int64_t>(p0, 0), (std::vector<int64_t>{11, 11, 22}));
+  EXPECT_EQ(sortedValues<int64_t>(p1, 0), (std::vector<int64_t>{11, 22, 22}));
+  EXPECT_EQ(sortedValues<int64_t>(p2, 0), (std::vector<int64_t>{11, 22, 22}));
+}
+
+// Boolean constant vectors are flattened across append() calls.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    booleanConstantColumnAcrossAppends) {
+  auto rowType = ROW({"v"}, {BOOLEAN()});
+  auto serializer = makeSerializer(rowType, 2);
+  // Four rows of constant true, then three rows of constant false.
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<bool>(true, 4)}), {0, 1, 0, 1});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<bool>(false, 3)}), {1, 0, 1});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  auto p1 = deserialize(*output.at(1).first, rowType);
+
+  EXPECT_EQ(sortedValues<bool>(p0, 0), (std::vector<bool>{false, true, true}));
+  EXPECT_EQ(
+      sortedValues<bool>(p1, 0), (std::vector<bool>{false, false, true, true}));
+}
+
+// Null constant vectors contribute only nulls but still advance row positions.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    nullConstantColumnAcrossAppends) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 2);
+  // Three rows of a null constant, then three rows of constant 7.
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(std::nullopt, 3)}),
+      {0, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(7, 3)}), {1, 0, 1});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  // Sort before comparing; std::nullopt orders before every value.
+  auto sorted0 = nullableValues<int64_t>(p0, 0);
+  std::sort(sorted0.begin(), sorted0.end());
+  auto expected0 =
+      std::vector<std::optional<int64_t>>{std::nullopt, std::nullopt, 7};
+  EXPECT_EQ(sorted0, expected0);
+
+  auto sorted1 = nullableValues<int64_t>(p1, 0);
+  std::sort(sorted1.begin(), sorted1.end());
+  auto expected1 = std::vector<std::optional<int64_t>>{std::nullopt, 7, 7};
+  EXPECT_EQ(sorted1, expected1);
+}
+
+// Constant and flat vectors are flattened and serialized correctly across
+// append() calls.
+TEST_F(PrestoIterativePartitioningSerializerTest, mixedConstantFlatVector) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 2);
+  // Encodings alternate between appends: constant, flat, constant.
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(7, 3)}), {0, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3})}), {1, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(8, 2)}), {0, 1});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  auto p1 = deserialize(*output.at(1).first, rowType);
+
+  EXPECT_EQ(sortedValues<int64_t>(p0, 0), (std::vector<int64_t>{3, 7, 7, 8}));
+  EXPECT_EQ(sortedValues<int64_t>(p1, 0), (std::vector<int64_t>{1, 2, 7, 8}));
+}
+
+// Null constant rows are preserved and serialized correctly with flat and
+// nullable flat vectors across append() calls.
+TEST_F(PrestoIterativePartitioningSerializerTest, mixedNullConstantFlatVector) {
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto serializer = makeSerializer(rowType, 2);
+  // Appends in order: flat without nulls, null constant, flat with one null.
+  serializer->append(
+      makeRowVector({"v"}, {makeFlatVector<int64_t>({1, 2, 3, 4})}),
+      {0, 1, 1, 0});
+  serializer->append(
+      makeRowVector({"v"}, {makeConstant<int64_t>(std::nullopt, 3)}),
+      {0, 1, 0});
+  serializer->append(
+      makeRowVector(
+          {"v"}, {makeNullableFlatVector<int64_t>({std::nullopt, 7, 3})}),
+      {1, 0, 1});
+
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  auto p0 = deserialize(*output.at(0).first, rowType);
+  auto p1 = deserialize(*output.at(1).first, rowType);
+  // Sort before comparing; std::nullopt orders before every value.
+  auto sorted0 = nullableValues<int64_t>(p0, 0);
+  std::sort(sorted0.begin(), sorted0.end());
+  auto expected0 =
+      std::vector<std::optional<int64_t>>{std::nullopt, std::nullopt, 1, 4, 7};
+  EXPECT_EQ(sorted0, expected0);
+
+  auto sorted1 = nullableValues<int64_t>(p1, 0);
+  std::sort(sorted1.begin(), sorted1.end());
+  auto expected1 =
+      std::vector<std::optional<int64_t>>{std::nullopt, std::nullopt, 2, 3, 3};
+  EXPECT_EQ(sorted1, expected1);
+}
+
+// ── Scale and regression
+// ───────────────────────────────────────────────────────
+
+// 1024 partitions with random int64 values: verify every value reaches
+// exactly the right partition and nothing is lost or duplicated.
+TEST_F(PrestoIterativePartitioningSerializerTest, manyPartitionsRandom) {
+  constexpr uint32_t kNumPartitions = 1024;
+  constexpr int32_t kNumRows = 64'000;
+  // Fixed seed keeps the generated data identical from run to run.
+  std::mt19937_64 engine(42);
+  std::uniform_int_distribution<int64_t> valueDist;
+  std::uniform_int_distribution<uint32_t> partDist(0, kNumPartitions - 1);
+
+  std::vector<int64_t> values(kNumRows);
+  std::vector<uint32_t> partitions(kNumRows);
+  // expected[p] collects the values routed to partition p; sorted below.
+  std::vector<std::vector<int64_t>> expected(kNumPartitions);
+
+  for (int row = 0; row < kNumRows; ++row) {
+    values[row] = valueDist(engine);
+    partitions[row] = partDist(engine);
+    expected[partitions[row]].push_back(values[row]);
+  }
+  for (auto& bucket : expected) {
+    std::sort(bucket.begin(), bucket.end());
+  }
+
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto batch = makeRowVector({"v"}, {makeFlatVector<int64_t>(values)});
+
+  auto serializer = makeSerializer(rowType, kNumPartitions);
+  serializer->append(batch, partitions);
+  auto output = serializer->flush();
+
+  // A page must exist exactly for the partitions that received rows.
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    if (expected[p].empty()) {
+      EXPECT_EQ(output.count(p), 0) << "partition " << p;
+    } else {
+      ASSERT_EQ(output.count(p), 1) << "partition " << p;
+      auto result = deserialize(*output.at(p).first, rowType);
+      ASSERT_EQ(result->size(), static_cast<int32_t>(expected[p].size()))
+          << "partition " << p;
+      EXPECT_EQ(sortedValues<int64_t>(result, 0), expected[p])
+          << "partition " << p;
+    }
+  }
+}
+
+// 1024 partitions with random int64 values and ~25% nulls: verify every
+// value and null reaches exactly the right partition (order within a
+// partition is not checked), and nothing is lost or duplicated.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    manyPartitionsRandomWithNulls) {
+  constexpr uint32_t kNumPartitions = 1024;
+  constexpr int32_t kNumRows = 64'000;
+  constexpr int32_t kNullPct = 25;
+  // Fixed seed keeps the generated data identical from run to run.
+  std::mt19937_64 engine(43);
+  std::uniform_int_distribution<int64_t> valueDist;
+  std::uniform_int_distribution<uint32_t> partDist(0, kNumPartitions - 1);
+  std::uniform_int_distribution<int32_t> nullDist(0, 99);
+
+  std::vector<std::optional<int64_t>> inputValues(kNumRows);
+  std::vector<uint32_t> partitions(kNumRows);
+  // expected[p] collects, in input order, every value-or-null routed to
+  // partition p.
+  std::vector<std::vector<std::optional<int64_t>>> expected(kNumPartitions);
+
+  for (int row = 0; row < kNumRows; ++row) {
+    partitions[row] = partDist(engine);
+    if (nullDist(engine) >= kNullPct) {
+      inputValues[row] = valueDist(engine);
+    } else {
+      inputValues[row] = std::nullopt;
+    }
+    expected[partitions[row]].push_back(inputValues[row]);
+  }
+
+  auto rowType = ROW({"v"}, {BIGINT()});
+  auto batch =
+      makeRowVector({"v"}, {makeNullableFlatVector<int64_t>(inputValues)});
+
+  auto serializer = makeSerializer(rowType, kNumPartitions);
+  serializer->append(batch, partitions);
+  auto output = serializer->flush();
+
+  // Row order inside a partition is not compared: both sides are sorted first.
+  // std::optional<T> sorts with nullopt < any value, preserving null count.
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    if (expected[p].empty()) {
+      EXPECT_EQ(output.count(p), 0) << "partition " << p;
+    } else {
+      ASSERT_EQ(output.count(p), 1) << "partition " << p;
+      auto result = deserialize(*output.at(p).first, rowType);
+      ASSERT_EQ(result->size(), static_cast<int32_t>(expected[p].size()))
+          << "partition " << p;
+
+      auto wanted = expected[p];
+      std::sort(wanted.begin(), wanted.end());
+
+      auto found = nullableValues<int64_t>(result, 0);
+      std::sort(found.begin(), found.end());
+
+      EXPECT_EQ(found, wanted) << "partition " << p;
+    }
+  }
+}
+
+// ── Checksum (CRC32)
+// ──────────────────────────────────────────────────────
+
+// Verify the checksum bit is set and a non-zero checksum is written when a
+// PrestoOutputStreamListener factory is provided, and that the standard
+// deserializer (which validates the checksum) accepts the page.
+TEST_P(PrestoIterativePartitioningSerializerParamTest, checksumRoundTrip) {
+  auto elementType = GetParam();
+  auto rowType = ROW({"a"}, {elementType});
+  auto column = BaseVector::create(elementType, 6, pool_.get());
+  column->setNull(1, true);
+  column->setNull(4, true);
+  // Rows alternate between the two partitions.
+  auto serializer = makeSerializerWithListener(rowType, 2);
+  serializer->append(makeRowVector({"a"}, {column}), {0, 1, 0, 1, 0, 1});
+  auto output = serializer->flush();
+  ASSERT_EQ(output.size(), 2);
+
+  for (auto& [partition, entry] : output) {
+    auto& page = *entry.first;
+    EXPECT_NE(codecByte(page) & kChecksumBitMask, 0)
+        << "checksum bit must be set in codec byte";
+    EXPECT_NE(checksumField(page), 0) << "checksum field must be non-zero";
+    // deserialize() checks the checksum itself and throws on a mismatch.
+    auto result = deserialize(page, rowType);
+    EXPECT_GT(result->size(), 0);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Non-typed fixture (TEST_F) — flushNulls regression
+// ---------------------------------------------------------------------------
+
+// Regression: flushNulls previously wrote null bitmaps by obtaining a raw
+// pointer via writePosition() then advancing the stream via seekp(). This
+// assumed the pre-allocated IOBufOutputStream had a single contiguous range,
+// but StreamArena::newRange caps each range at the size of one allocator run,
+// which can be smaller than the requested size. seekp() then failed because
+// the target position exceeded the end of the first (and only) range.
+//
+// Reproducing condition: 16 columns × 10'000 rows × 50% nulls in one
+// partition generates enough output (~100 KB) to trigger the run-size cap.
+TEST_F(
+    PrestoIterativePartitioningSerializerTest,
+    flushNullsBitmapManyColumnsLargeRowCount) {
+  constexpr int32_t kNumCols = 16;
+  constexpr int32_t kNumRows = 10'000;
+
+  std::vector<std::string> names;
+  std::vector<VectorPtr> children;
+  names.reserve(kNumCols);
+  children.reserve(kNumCols);
+
+  for (int col = 0; col < kNumCols; ++col) {
+    names.push_back(fmt::format("c{}", col));
+    // Even rows are null; odd rows carry the value (row * kNumCols + col).
+    children.push_back(
+        makeFlatVector<int64_t>(
+            kNumRows,
+            [col](auto row) {
+              return static_cast<int64_t>(row * kNumCols + col);
+            },
+            [](auto row) { return (row % 2) == 0; }));
+  }
+
+  auto batch = makeRowVector(names, children);
+  auto rowType = std::static_pointer_cast<const RowType>(batch->type());
+  // Every row goes to the single partition 0.
+  auto serializer = makeSerializer(rowType, 1);
+  serializer->append(batch, std::vector<uint32_t>(kNumRows, 0));
+  auto output = serializer->flush();
+
+  ASSERT_EQ(output.size(), 1);
+
+  auto result = deserialize(*output.at(0).first, rowType);
+  ASSERT_EQ(result->size(), kNumRows);
+  // Check each cell: null flag first, then the value of the not-null rows.
+  for (int col = 0; col < kNumCols; ++col) {
+    auto* values = result->childAt(col)->as<FlatVector<int64_t>>();
+    for (int row = 0; row < kNumRows; ++row) {
+      if ((row % 2) == 0) {
+        EXPECT_TRUE(result->childAt(col)->isNullAt(row))
+            << "col=" << col << " row=" << row;
+      } else {
+        ASSERT_FALSE(result->childAt(col)->isNullAt(row))
+            << "col=" << col << " row=" << row;
+        EXPECT_EQ(
+            values->valueAt(row), static_cast<int64_t>(row * kNumCols + col))
+            << "col=" << col << " row=" << row;
+      }
+    }
+  }
+}
diff --git a/velox/vector/CMakeLists.txt b/velox/vector/CMakeLists.txt
index 9fd4f2ca9ea..6f76bc9bfb9 100644
--- a/velox/vector/CMakeLists.txt
+++ b/velox/vector/CMakeLists.txt
@@ -22,6 +22,7 @@ velox_add_library(
   FlatVector.cpp
   LazyVector.cpp
   MapConcat.cpp
+  PartitionedVector.cpp
   SelectivityVector.cpp
   SequenceVector.cpp
   SimpleVector.cpp
diff --git a/velox/vector/PartitionedVector.cpp b/velox/vector/PartitionedVector.cpp
new file mode 100644
index 00000000000..233c932fee9
--- /dev/null
+++ b/velox/vector/PartitionedVector.cpp
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/vector/PartitionedVector.h"
+
+#include "velox/vector/FlatVector.h"
+
+namespace facebook::velox {
+
+using Byte = uint8_t;
+using BitIndex = uint8_t;
+
+namespace {
+
+// Adds 1 to rowCounts[id] per partition id; ids are not range-checked.
+inline void countPartitionSizes(
+    const std::vector<uint32_t>& partitions,
+    vector_size_t* rowCounts) {
+  VELOX_DCHECK_NOT_NULL(rowCounts);
+  for (const auto partitionId : partitions) {
+    ++rowCounts[partitionId];
+  }
+}
+
+inline void prefixSum(vector_size_t* offsets, uint32_t numPartitions) {
+  for (uint32_t p = 1; p < numPartitions; ++p) { // In-place inclusive scan.
+    offsets[p] = offsets[p - 1] + offsets[p]; // Counts -> cumulative ends.
+  }
+}
+
+// Fills endPartitionOffsets[i] with the exclusive end row of partition i.
+inline void calculateOffsets(
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    vector_size_t* endPartitionOffsets) {
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+  if (numPartitions <= 1) { // All rows go to partition 0; ids are not read.
+    endPartitionOffsets[0] = static_cast<vector_size_t>(partitions.size());
+    return;
+  }
+  std::fill_n(endPartitionOffsets, numPartitions, 0);
+  countPartitionSizes(partitions, endPartitionOffsets);
+  prefixSum(endPartitionOffsets, numPartitions);
+}
+
+// Seeds the per-partition write cursors from the partition end offsets.
+// endPartitionOffsets has numPartitions entries; entry i is the exclusive end
+// row of partition i. On return cursorPartitionOffsets[0] == 0 and, for i > 0,
+// cursorPartitionOffsets[i] == endPartitionOffsets[i - 1], i.e. every cursor
+// starts at the inclusive begin row of its partition.
+void initializeCursorPartitionOffsets(
+    BufferPtr& cursorPartitionOffsets,
+    const BufferPtr& endPartitionOffsets,
+    uint32_t numPartitions,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+  VELOX_DCHECK_EQ(
+      endPartitionOffsets->size(), numPartitions * sizeof(vector_size_t));
+
+  ensureCapacity<vector_size_t>(cursorPartitionOffsets, numPartitions, pool);
+  auto* cursors = cursorPartitionOffsets->asMutable<vector_size_t>();
+  const auto* ends = endPartitionOffsets->as<vector_size_t>();
+  cursors[0] = 0;
+  // Shift the end offsets right by one slot; the last end offset is not needed.
+  std::memcpy(cursors + 1, ends, sizeof(vector_size_t) * (numPartitions - 1));
+  cursorPartitionOffsets->setSize(numPartitions * sizeof(vector_size_t));
+}
+
+// In-place partitioning algorithm for fixed-width values.
+// Partitions are processed in order. For the current partition, the value at
+// its cursor is swapped into the next free slot of the partition it belongs to
+// until the value left at the cursor belongs to the current partition. Then
+// the cursor advances. 'partitions' itself is never reordered; a slot is only
+// consulted for the value that originally lived there.
+template <typename T>
+void partitionFixedWidthValuesInPlace(
+    T* values,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(values);
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+  initializeCursorPartitionOffsets(
+      ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool);
+  auto* rawCursorOffsets =
+      ctx.cursorPartitionOffsets->asMutable<vector_size_t>();
+  const auto* rawEndOffsets = endPartitionOffsets->as<vector_size_t>();
+
+  for (uint32_t currentPartition = 0; currentPartition < numPartitions;
+       ++currentPartition) {
+    // Reference: advancing 'offset' advances this partition's cursor.
+    auto& offset = rawCursorOffsets[currentPartition];
+    const auto endOffset = rawEndOffsets[currentPartition];
+    for (; offset < endOffset; ++offset) {
+      auto targetPartition = partitions[offset];
+      while (targetPartition != currentPartition) {
+        const auto destinationOffset = rawCursorOffsets[targetPartition]++;
+        std::swap(values[destinationOffset], values[offset]);
+        targetPartition = partitions[destinationOffset];
+      }
+    }
+  }
+}
+
+// Exchanges bit 'bit1' of 'byte1' with bit 'bit2' of 'byte2'. The two bytes
+// may be the same object.
+void swapBit(Byte& byte1, BitIndex bit1, Byte& byte2, BitIndex bit2) {
+  // 1 when the two bits differ, 0 when they are already equal.
+  const int differs = ((byte1 >> bit1) ^ (byte2 >> bit2)) & 1;
+  // XOR-ing with 1 flips a bit, XOR-ing with 0 leaves it alone.
+  byte1 ^= (differs << bit1);
+  byte2 ^= (differs << bit2);
+}
+
+// Bit-level counterpart of partitionFixedWidthValuesInPlace. Bit i of 'bits'
+// belongs to row i; the bits are swapped in place, following the same
+// cursor-driven loop, until the bits of each partition are contiguous.
+void partitionBitsInPlace(
+    Byte* bits,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    const BufferPtr& endPartitionOffsets,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(bits);
+  VELOX_DCHECK_NOT_NULL(endPartitionOffsets);
+  initializeCursorPartitionOffsets(
+      ctx.cursorPartitionOffsets, endPartitionOffsets, numPartitions, pool);
+
+  auto* rawCursorOffsets =
+      ctx.cursorPartitionOffsets->asMutable<vector_size_t>();
+  const auto* rawEndOffsets = endPartitionOffsets->as<vector_size_t>();
+
+  for (uint32_t partition = 0; partition < numPartitions; ++partition) {
+    // Reference: advancing 'offset' advances this partition's cursor.
+    auto& offset = rawCursorOffsets[partition];
+    const auto endOffset = rawEndOffsets[partition];
+    for (; offset < endOffset; ++offset) {
+      // Partition of the bit currently parked at 'offset'.
+      uint32_t p = partitions[offset];
+      while (p != partition) {
+        const vector_size_t destinationOffset = rawCursorOffsets[p]++;
+
+        // A byte holds 8 bits: 'offset >> 3' is the byte index and
+        // 'offset & 7' the bit position inside that byte.
+        const vector_size_t dstByte = destinationOffset >> 3;
+        const auto dstBit = static_cast<BitIndex>(destinationOffset & 7);
+        const vector_size_t srcByte = offset >> 3;
+        const auto srcBit = static_cast<BitIndex>(offset & 7);
+
+        swapBit(bits[dstByte], dstBit, bits[srcByte], srcBit);
+        p = partitions[destinationOffset];
+      }
+    }
+  }
+}
+
+// Partitions the values of type T stored in 'inputBuffer' in place.
+template <typename T>
+void partitionFixedWidthValues(
+    BufferPtr& inputBuffer,
+    const std::vector<uint32_t>& partitions,
+    const BufferPtr& endPartitionOffsets,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(inputBuffer);
+  T* rawValues = inputBuffer->asMutable<T>();
+  partitionFixedWidthValuesInPlace<T>(
+      rawValues, partitions, numPartitions, endPartitionOffsets, ctx, pool);
+}
+
+// bool specialization: the buffer is handled as packed bits, one per row.
+template <>
+void partitionFixedWidthValues<bool>(
+    BufferPtr& inputBuffer,
+    const std::vector<uint32_t>& partitions,
+    const BufferPtr& endPartitionOffsets,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_DCHECK_NOT_NULL(inputBuffer);
+  Byte* packedBits = inputBuffer->asMutable<Byte>();
+  partitionBitsInPlace(
+      packedBits, partitions, numPartitions, ctx, endPartitionOffsets, pool);
+}
+
+// Type-dispatch target for FLAT vectors: wraps 'vector', which must be a
+// FlatVector of typeKind's native type, and partitions it in place.
+template <TypeKind typeKind>
+PartitionedVectorPtr createPartitionedFlatVector(
+    VectorPtr vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  using T = typename TypeTraits<typeKind>::NativeType;
+  auto flatVector = std::dynamic_pointer_cast<FlatVector<T>>(vector);
+  VELOX_CHECK_NOT_NULL(flatVector);
+
+  auto result = std::make_shared<PartitionedFlatVector<T>>(
+      flatVector, numPartitions, endPartitionOffsets, pool);
+  // partition() runs even for a single partition (no data movement needed)
+  // because it is also what populates numNullsPerPartition_.
+  result->partition(partitions, ctx);
+  return result;
+}
+
+// Wraps 'vector', which must be a RowVector, in a PartitionedRowVector and
+// partitions it (children included) in place.
+PartitionedVectorPtr createPartitionedRowVector(
+    VectorPtr vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(vector);
+  VELOX_CHECK_NOT_NULL(rowVector);
+
+  auto result = std::make_shared<PartitionedRowVector>(
+      rowVector, numPartitions, endPartitionOffsets, pool);
+  // partition() runs even for a single partition: it is what initializes
+  // partitionedChildren_, which partitionAt() needs to rebuild the RowVector.
+  result->partition(partitions, ctx);
+  return result;
+}
+
+} // namespace
+
+PartitionedVector::~PartitionedVector() = default;
+
+PartitionedVectorPtr PartitionedVector::create(
+    const VectorPtr& vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  VELOX_CHECK_NOT_NULL(vector);
+  VELOX_CHECK_EQ(vector->size(), partitions.size());
+  VELOX_CHECK_GT(numPartitions, 0);
+  VELOX_CHECK_NOT_NULL(pool);
+
+  // Calculate the end offsets for each partition. For example, if there are 3
+  // partitions with 2, 3, and 1 rows respectively, then endPartitionOffsets is
+  // {2, 5, 6}. Partition ids are not range-checked here; per the contract in
+  // the header each id must be < numPartitions.
+  BufferPtr endPartitionOffsets;
+  ensureCapacity<vector_size_t>(endPartitionOffsets, numPartitions, pool);
+  auto* rawEndOffsets = endPartitionOffsets->asMutable<vector_size_t>();
+  calculateOffsets(partitions, numPartitions, rawEndOffsets);
+  endPartitionOffsets->setSize(numPartitions * sizeof(vector_size_t));
+
+  // The last end offset must equal the row count. 'rawEndOffsets' is also used
+  // above, so nothing is left unused when DCHECKs are compiled out.
+  VELOX_DCHECK_EQ(rawEndOffsets[numPartitions - 1], partitions.size());
+
+  return create(
+      vector, partitions, numPartitions, endPartitionOffsets, ctx, pool);
+}
+
+PartitionedVectorPtr PartitionedVector::create(
+    const VectorPtr& vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions,
+    const BufferPtr& endPartitionOffsets,
+    PartitionBuildContext& ctx,
+    velox::memory::MemoryPool* pool) {
+  // Also reached directly for RowVector children, so validate 'vector' here.
+  VELOX_CHECK_NOT_NULL(vector);
+  VELOX_CHECK_NOT_NULL(endPartitionOffsets);
+  VELOX_CHECK_EQ(
+      endPartitionOffsets->size(), numPartitions * sizeof(vector_size_t));
+
+  const auto encoding = vector->encoding();
+  const auto typeKind = vector->typeKind();
+  switch (encoding) {
+    case VectorEncoding::Simple::FLAT: {
+      return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+          createPartitionedFlatVector,
+          typeKind,
+          vector,
+          partitions,
+          numPartitions,
+          endPartitionOffsets,
+          ctx,
+          pool);
+    }
+
+    case VectorEncoding::Simple::ROW: {
+      return createPartitionedRowVector(
+          vector, partitions, numPartitions, endPartitionOffsets, ctx, pool);
+    }
+
+    case VectorEncoding::Simple::CONSTANT: {
+      auto partitionedConstantVector =
+          std::make_shared<PartitionedConstantVector>(
+              vector, numPartitions, endPartitionOffsets, pool);
+      partitionedConstantVector->partition(partitions, ctx);
+      return partitionedConstantVector;
+    }
+
+    case VectorEncoding::Simple::ARRAY:
+    case VectorEncoding::Simple::MAP:
+    case VectorEncoding::Simple::DICTIONARY:
+    case VectorEncoding::Simple::BIASED:
+    case VectorEncoding::Simple::SEQUENCE:
+    case VectorEncoding::Simple::LAZY:
+      VELOX_UNSUPPORTED(
+          "Unsupported vector encoding for PartitionedVector: {}",
+          mapSimpleToName(encoding));
+    default:
+      VELOX_UNREACHABLE(
+          "Invalid vector encoding for PartitionedVector: {}", encoding);
+  }
+}
+
+VectorPtr PartitionedVector::baseVector() const {
+  return vector_; // Shares ownership of the underlying vector with the caller.
+}
+
+// Renders the partition count followed by the comma-separated cumulative end
+// offsets, e.g. "PartitionedVector[numPartitions: 2, offsets: 3,5]".
+std::string PartitionedVector::toString() const {
+  std::string offsets;
+  for (uint32_t i = 0; i < numPartitions_; ++i) {
+    offsets += fmt::format(
+        "{}{}", i == 0 ? "" : ",", rawEndPartitionOffsets_[i]);
+  }
+
+  return fmt::format(
+      "PartitionedVector[numPartitions: {}, offsets: {}]",
+      numPartitions_,
+      offsets);
+}
+
+template <typename T>
+void PartitionedFlatVector<T>::partition(
+    const std::vector<uint32_t>& partitions,
+    PartitionBuildContext& ctx) {
+  if (vector_->rawNulls()) {
+    Byte* rawNulls = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
+    partitionBitsInPlace(
+        rawNulls, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
+  }
+  // Values may be absent (presumably all-null vector, TODO confirm): skip then.
+  if (auto valuesBuffer = vector_->as<FlatVector<T>>()->values()) {
+    partitionFixedWidthValues<T>(
+        valuesBuffer,
+        partitions,
+        endPartitionOffsets_,
+        numPartitions_,
+        ctx,
+        pool_);
+  }
+  // Count nulls per partition from the now-partitioned null bitmap.
+  if (const uint64_t* rawNulls = vector_->rawNulls()) {
+    for (uint32_t p = 0; p < numPartitions_; ++p) {
+      const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1];
+      const vector_size_t end = rawEndPartitionOffsets_[p];
+      if (begin < end) {
+        numNullsPerPartition_[p] =
+            static_cast<vector_size_t>(bits::countNulls(rawNulls, begin, end));
+      }
+    }
+  }
+}
+
+// Returns the rows of 'partition' as a slice of the partitioned vector.
+template <typename T>
+VectorPtr PartitionedFlatVector<T>::partitionAt(uint32_t partition) const {
+  VELOX_CHECK_LT(partition, numPartitions_);
+
+  // Partition p spans rows [end[p - 1], end[p]); partition 0 starts at row 0.
+  const vector_size_t endOffset = rawEndPartitionOffsets_[partition];
+  const vector_size_t beginOffset =
+      partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1];
+  return vector_->slice(beginOffset, endOffset - beginOffset);
+}
+
+void PartitionedRowVector::partition(
+    const std::vector<uint32_t>& partitions,
+    PartitionBuildContext& ctx) {
+  // Partition every child column with the same partition ids and end offsets;
+  // 'ctx' is shared so its cursor buffer is reused across children.
+  const auto& children = vector_->as<RowVector>()->children();
+  partitionedChildren_.reserve(children.size());
+  for (const auto& child : children) {
+    partitionedChildren_.push_back(PartitionedVector::create(
+        child, partitions, numPartitions_, endPartitionOffsets_, ctx, pool_));
+  }
+
+  // Row-level nulls: with a single partition nothing has to move, so the bit
+  // shuffle is skipped.
+  if (numPartitions_ > 1 && vector_->rawNulls()) {
+    auto* nullBits = reinterpret_cast<Byte*>(vector_->mutableRawNulls());
+    partitionBitsInPlace(
+        nullBits, partitions, numPartitions_, ctx, endPartitionOffsets_, pool_);
+  }
+
+  // Count nulls per partition from the now-partitioned null bitmap. Without a
+  // null buffer the counts keep their initial value of 0.
+  const uint64_t* rawNulls = vector_->rawNulls();
+  if (rawNulls == nullptr) {
+    return;
+  }
+  for (uint32_t p = 0; p < numPartitions_; ++p) {
+    const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1];
+    const vector_size_t end = rawEndPartitionOffsets_[p];
+    if (begin < end) {
+      numNullsPerPartition_[p] =
+          static_cast<vector_size_t>(bits::countNulls(rawNulls, begin, end));
+    }
+  }
+}
+
+VectorPtr PartitionedRowVector::partitionAt(uint32_t partition) const {
+  VELOX_CHECK_LT(partition, numPartitions_);
+
+  // Partition p spans rows [end[p - 1], end[p]); partition 0 starts at row 0.
+  const vector_size_t endOffset = rawEndPartitionOffsets_[partition];
+  const vector_size_t beginOffset =
+      partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1];
+  const vector_size_t numRows = endOffset - beginOffset;
+
+  // Each partitioned child contributes its own view of this partition.
+  std::vector<VectorPtr> children;
+  children.reserve(partitionedChildren_.size());
+  for (const auto& child : partitionedChildren_) {
+    children.push_back(child->partitionAt(partition));
+  }
+
+  // Row-level nulls are copied into a fresh buffer starting at bit 0.
+  BufferPtr nulls;
+  const uint64_t* rawNulls = vector_->rawNulls();
+  if (numRows > 0 && rawNulls != nullptr) {
+    nulls = AlignedBuffer::allocate<bool>(numRows, pool_);
+    bits::copyBits(
+        rawNulls, beginOffset, nulls->asMutable<uint64_t>(), 0, numRows);
+  }
+
+  return std::make_shared<RowVector>(
+      pool_,
+      vector_->type(),
+      std::move(nulls),
+      numRows,
+      std::move(children));
+}
+
+void PartitionedConstantVector::partition(
+    const std::vector<uint32_t>& /*partitions*/,
+    PartitionBuildContext& /*ctx*/) {
+  // No data moves for a constant vector. Only a null constant has null counts
+  // to record: every row of every non-empty partition is null.
+  if (vector_->isNullAt(0)) {
+    for (uint32_t p = 0; p < numPartitions_; ++p) {
+      const vector_size_t begin = p == 0 ? 0 : rawEndPartitionOffsets_[p - 1];
+      const vector_size_t end = rawEndPartitionOffsets_[p];
+      if (begin < end) {
+        numNullsPerPartition_[p] = end - begin;
+      }
+    }
+  }
+}
+
+VectorPtr PartitionedConstantVector::partitionAt(uint32_t partition) const {
+  VELOX_CHECK_LT(partition, numPartitions_);
+
+  // Only the row count of the partition is used: the slice always starts at
+  // row 0 of the base vector, wherever the partition itself begins.
+  const vector_size_t endOffset = rawEndPartitionOffsets_[partition];
+  const vector_size_t beginOffset =
+      partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1];
+  return vector_->slice(0, endOffset - beginOffset);
+}
+
+} // namespace facebook::velox
diff --git a/velox/vector/PartitionedVector.h b/velox/vector/PartitionedVector.h
new file mode 100644
index 00000000000..24dec3f03fb
--- /dev/null
+++ b/velox/vector/PartitionedVector.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <vector>
+
+#include "velox/vector/BaseVector.h"
+#include "velox/vector/ComplexVector.h"
+
+namespace facebook::velox {
+
+class PartitionedVector;
+using PartitionedVectorPtr = std::shared_ptr<PartitionedVector>;
+
+namespace {
+
+// TODO: This was copied from dwio::common::BufferUtil.h. However the vector
+// module should not depend on dwio. Move this to a common place.
+// Makes 'data' a mutable buffer with room for at least numElements values of
+// type T, reallocating from 'pool' when it is missing, immutable or too small.
+template <typename T>
+inline void ensureCapacity(
+    BufferPtr& data,
+    size_t numElements,
+    velox::memory::MemoryPool* pool,
+    bool preserveOldData = false,
+    bool clearBits = false) {
+  size_t oldSize = 0;
+  const size_t newCapacity = BaseVector::byteSize<T>(numElements);
+  if (!data) {
+    data = AlignedBuffer::allocate<T>(numElements, pool);
+  } else {
+    oldSize = data->size();
+    if (!data->isMutable() || data->capacity() < newCapacity) {
+      auto newData = AlignedBuffer::allocate<T>(numElements, pool);
+      if (preserveOldData) {
+        // An immutable buffer is replaced even when it is larger than the
+        // request, so never copy more bytes than the new buffer was sized for.
+        const size_t bytes = oldSize < newCapacity ? oldSize : newCapacity;
+        std::memcpy(
+            newData->template asMutable<uint8_t>(), data->as<uint8_t>(), bytes);
+      }
+      data = newData;
+    }
+  }
+
+  if (clearBits && newCapacity > oldSize) {
+    std::memset(data->asMutable<int8_t>() + oldSize, 0, newCapacity - oldSize);
+  }
+}
+
+} // namespace
+
+/// Construction-time context used to build a PartitionedVector.
+///
+/// Holds only transient scratch state needed while PartitionedVector::create()
+/// runs; nothing here defines the logical state of the resulting
+/// PartitionedVector. The scratch buffer is re-initialized by every
+/// partitioning pass, so its contents are meaningful only during create().
+struct PartitionBuildContext {
+  BufferPtr cursorPartitionOffsets = nullptr; // Per-partition write cursors.
+
+  PartitionBuildContext() = default;
+};
+
+/// PartitionedVector provides an in-place, partition-aware layout of a vector
+/// based on per-row partition IDs.
+///
+/// This is a low-level execution abstraction, analogous to DecodedVector:
+/// - it owns partitioning metadata (offsets, indices)
+/// - it does not encode operator-specific semantics
+/// - it is intended to be reused by multiple exec components
+///   (aggregation, sorting, shuffle, etc.)
+///
+/// The partitioning operation rearranges rows so that rows belonging to the
+/// same partition occupy a contiguous range.
+///
+/// Thread-safety:
+///   This class is NOT thread-safe. All methods must be called from a single
+///   thread. Internal buffers are mutated during create().
+class PartitionedVector {
+ public:
+  /// Not default-constructible and not copyable, but movable.
+  PartitionedVector() = delete;
+  PartitionedVector(const PartitionedVector& other) = delete;
+  PartitionedVector& operator=(const PartitionedVector& other) = delete;
+  PartitionedVector(PartitionedVector&&) noexcept = default;
+  PartitionedVector& operator=(PartitionedVector&&) noexcept = default;
+
+  virtual ~PartitionedVector();
+
+  /// Factory method to create a PartitionedVector. This is the main entry point
+  /// for constructing a PartitionedVector. The partitioning operation
+  /// rearranges rows in the base vector so that rows belonging to the same
+  /// partition occupy a contiguous range.
+  ///
+  /// Params:
+  /// - vector: the base vector to be partitioned. This is modified during
+  ///   partitioning, and becomes the underlying vector of the created
+  ///   PartitionedVector.
+  /// - partitions: a vector of partition IDs for each row in the base vector.
+  ///   The length of this vector must be the same as the number of rows in the
+  ///   base vector. Each entry must be a value between 0 and numPartitions - 1.
+  /// - numPartitions: the total number of partitions. This must be greater than
+  ///   0.
+  /// - ctx: the context object for building the partitioned vector. This
+  ///   contains transient execution context needed during construction, such as
+  ///   intermediate buffers. None of the fields in this context define the
+  ///   logical state of the PartitionedVector, and none are retained after
+  ///   create(). All fields in this context are only valid during the create()
+  ///   call.
+  /// - pool: the memory pool for allocating any necessary buffers during the
+  ///   creation of the PartitionedVector.
+  static PartitionedVectorPtr create(
+      const VectorPtr& vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions,
+      PartitionBuildContext& ctx,
+      velox::memory::MemoryPool* pool);
+
+  /// Returns the underlying vector.
+  VectorPtr baseVector() const;
+
+  /// Returns the partitioned vector at partition p. If the number of rows in
+  /// that partition is 0, returns an empty vector.
+  virtual VectorPtr partitionAt(uint32_t partition) const = 0;
+
+  /// Downcasts to the concrete type T; returns nullptr if this is not a T.
+  template <typename T>
+  T* as() {
+    static_assert(std::is_base_of_v<PartitionedVector, T>);
+    return dynamic_cast<T*>(this);
+  }
+
+  /// Returns the number of null rows in the given partition.
+  vector_size_t numNullsAt(uint32_t partition) const {
+    VELOX_DCHECK_LT(partition, numPartitions_);
+    return numNullsPerPartition_[partition];
+  }
+
+  /// Returns the number of rows in the given partition.
+  vector_size_t numRowsAt(uint32_t partition) const {
+    VELOX_DCHECK_LT(partition, numPartitions_);
+    auto beginOffset =
+        partition == 0 ? 0 : rawEndPartitionOffsets_[partition - 1];
+    auto endOffset = rawEndPartitionOffsets_[partition];
+    return endOffset - beginOffset;
+  }
+
+  /// Returns the type kind of the underlying vector.
+  TypeKind typeKind() const {
+    return vector_->typeKind();
+  }
+
+  /// Returns the raw cumulative end offsets, one entry per partition.
+  vector_size_t* rawPartitionOffsets() {
+    return rawEndPartitionOffsets_;
+  }
+
+  /// Subclass-specific; the flat, row and constant subclasses do not support it.
+  virtual const vector_size_t* rawSizes() = 0;
+
+  /// Returns a summary string with the partition count and the end offsets.
+  virtual std::string toString() const;
+
+ protected:
+  // Internal create method that accepts pre-computed endPartitionOffsets
+  // buffer.
+  static PartitionedVectorPtr create(
+      const VectorPtr& vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsetsBuffer,
+      PartitionBuildContext& ctx,
+      velox::memory::MemoryPool* pool);
+
+  PartitionedVector(
+      const VectorPtr& vector,
+      uint32_t numPartitions,
+      const BufferPtr& endPartitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : vector_(vector),
+        numPartitions_(numPartitions),
+        endPartitionOffsets_(endPartitionOffsets),
+        numNullsPerPartition_(numPartitions, 0),
+        pool_(pool) {
+    VELOX_CHECK_NOT_NULL(vector_);
+    VELOX_CHECK_GT(numPartitions_, 0);
+    VELOX_CHECK_NOT_NULL(endPartitionOffsets_);
+    VELOX_CHECK_EQ(
+        endPartitionOffsets_->size(), numPartitions_ * sizeof(vector_size_t));
+    VELOX_CHECK_NOT_NULL(pool_);
+
+    rawEndPartitionOffsets_ = endPartitionOffsets_->asMutable<vector_size_t>();
+  }
+
+  virtual void partition(
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) = 0;
+
+  // The base vector that is being partitioned. This is modified during
+  // partitioning.
+  VectorPtr vector_;
+
+  // Total number of partitions, fixed at construction. It is not const so that
+  // the move assignment operator stays available.
+  uint32_t numPartitions_;
+
+  // The cumulative end row offsets for each partition. For example, if there
+  // are 3 partitions with 2, 3, and 1 rows respectively, then
+  // endPartitionOffsets_[0] = 2, endPartitionOffsets_[1] = 5, and
+  // endPartitionOffsets_[2] = 6.
+  BufferPtr endPartitionOffsets_;
+
+  // The raw pointer to the endPartitionOffsets_ buffer for easy access during
+  // partitioning.
+  vector_size_t* rawEndPartitionOffsets_;
+
+  /// Null row counts per partition, computed during partition().
+  std::vector<vector_size_t> numNullsPerPartition_;
+
+  velox::memory::MemoryPool* pool_;
+};
+
+using PartitionedVectorPtr = std::shared_ptr<PartitionedVector>;
+
+/// Partitions a FlatVector<T> by permuting its values and null bits in place.
+template <typename T>
+class PartitionedFlatVector : public PartitionedVector {
+ public:
+  PartitionedFlatVector(
+      const VectorPtr& flatVector,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : PartitionedVector(flatVector, numPartitions, partitionOffsets, pool) {}
+
+  void partition(
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) override;
+  VectorPtr partitionAt(uint32_t partition) const override;
+
+  const vector_size_t* rawSizes() override { // Not implemented for flat.
+    VELOX_UNREACHABLE("PartitionedFlatVector does not implement rawSizes()");
+  }
+};
+
+/// Partitions a RowVector in place so that rows belonging to the same
+/// partition occupy a contiguous range. Each child column is partitioned
+/// recursively; partitionAt() assembles a new RowVector from the children.
+class PartitionedRowVector : public PartitionedVector {
+ public:
+  PartitionedRowVector(
+      const VectorPtr& rowVector,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : PartitionedVector(rowVector, numPartitions, partitionOffsets, pool) {}
+
+  void partition(
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) override;
+
+  VectorPtr partitionAt(uint32_t partition) const override;
+
+  /// Returns the partitioned child vector at the given column index.
+  PartitionedVectorPtr childAt(uint32_t col) const {
+    VELOX_DCHECK_LT(col, partitionedChildren_.size());
+    return partitionedChildren_[col];
+  }
+
+  const vector_size_t* rawSizes() override { // Not implemented for rows.
+    VELOX_UNREACHABLE("PartitionedRowVector does not implement rawSizes()");
+  }
+
+ private:
+  /// Partitioned child columns, one per child; filled in by partition().
+  std::vector<PartitionedVectorPtr> partitionedChildren_;
+};
+
+/// Partitions a ConstantVector without moving data: partitionAt() returns a
+/// slice of the base vector sized to the partition (always starting at row 0).
+class PartitionedConstantVector : public PartitionedVector {
+ public:
+  PartitionedConstantVector(
+      const VectorPtr& constantVector,
+      uint32_t numPartitions,
+      const BufferPtr& partitionOffsets,
+      velox::memory::MemoryPool* pool)
+      : PartitionedVector(
+            constantVector,
+            numPartitions,
+            partitionOffsets,
+            pool) {}
+
+  void partition( // Only records per-partition null counts.
+      const std::vector<uint32_t>& partitions,
+      PartitionBuildContext& ctx) override;
+
+  VectorPtr partitionAt(uint32_t partition) const override;
+
+  const vector_size_t* rawSizes() override { // Not implemented for constants.
+    VELOX_UNREACHABLE(
+        "PartitionedConstantVector does not implement rawSizes()");
+  }
+};
+
+} // namespace facebook::velox
diff --git a/velox/vector/benchmarks/CMakeLists.txt b/velox/vector/benchmarks/CMakeLists.txt
index 0cb3c78bfd8..8c1840daa1b 100644
--- a/velox/vector/benchmarks/CMakeLists.txt
+++ b/velox/vector/benchmarks/CMakeLists.txt
@@ -45,3 +45,13 @@ target_link_libraries(
   gflags::gflags
   glog::glog
 )
+
+add_executable(velox_vector_partitioned_vector_benchmark PartitionedVectorBenchmark.cpp)
+target_link_libraries(
+  velox_vector_partitioned_vector_benchmark
+  velox_dwio_common_test_utils
+  velox_vector
+  velox_vector_test_lib
+  Folly::folly
+  Folly::follybenchmark # The benchmark source includes folly/Benchmark.h.
+)
diff --git a/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp
new file mode 100644
index 00000000000..8589bbec0a0
--- /dev/null
+++ b/velox/vector/benchmarks/PartitionedVectorBenchmark.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/vector/PartitionedVector.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+// Add the following definitions to allow Clion runs
+DEFINE_bool(gtest_color, false, "");
+DEFINE_string(gtest_filter, "*", "");
+
+using namespace facebook::velox;
+using namespace facebook::velox::test;
+
+namespace facebook::velox::test {
+
+namespace {
+
+// Random engine shared by the type shuffling and the partition assignment
+// below. Seeded with the fixed value 42 and declared thread_local, so every
+// thread starts from the same state.
+thread_local auto gen = std::mt19937(42);
+
+// Null predicates handed to VectorBuilder as 'isNullAt'.
+
+// Default-constructed, i.e. an empty std::function.
+// NOTE(review): presumably makeFlatVector treats an empty predicate as
+// "no nulls" - confirm against VectorTestBase.
+const std::function<bool(vector_size_t)> noNulls;
+
+// Marks every row as null.
+auto allNulls = [](vector_size_t) { return true; };
+
+// Marks rows with an even index as null.
+auto halfNulls = [](vector_size_t row) { return row % 2 == 0; };
+
+/// Returns a ROW type with 'numColumns' children, all of the scalar type
+/// produced by createScalarType<T>().
+template <TypeKind T>
+RowTypePtr scalarTypeGenerator(int32_t numColumns) {
+  std::vector<TypePtr> columnTypes(numColumns, createScalarType<T>());
+  return ROW(std::move(columnTypes));
+}
+
+/// Returns a ROW type with 'numColumns' children, all of type DATE.
+RowTypePtr dateTypeGenerator(int32_t numColumns) {
+  std::vector<TypePtr> columnTypes(numColumns, DATE());
+  return ROW(std::move(columnTypes));
+}
+
+/// Returns a ROW type with 'numColumns' children, all of type DECIMAL(10, 2).
+RowTypePtr shortDecimalTypeGenerator(int32_t numColumns) {
+  std::vector<TypePtr> columnTypes(numColumns, DECIMAL(10, 2));
+  return ROW(std::move(columnTypes));
+}
+
+/// Returns a ROW type with 'numColumns' children, all of type DECIMAL(20, 3).
+RowTypePtr longDecimalTypeGenerator(int32_t numColumns) {
+  std::vector<TypePtr> columnTypes(numColumns, DECIMAL(20, 3));
+  return ROW(std::move(columnTypes));
+}
+
+/// Returns a ROW type with 'numColumns' children drawn from a fixed list of
+/// twelve flat types. The list is cycled through in order and the resulting
+/// column types are then shuffled with the file-level engine 'gen'.
+RowTypePtr mixedFlatTypeGenerator(int32_t numColumns) {
+  const std::vector<TypePtr> candidates = {
+      BOOLEAN(),
+      TINYINT(),
+      SMALLINT(),
+      INTEGER(),
+      BIGINT(),
+      HUGEINT(),
+      REAL(),
+      DOUBLE(),
+      TIMESTAMP(),
+      DATE(),
+      DECIMAL(10, 2),
+      DECIMAL(20, 3),
+  };
+  const auto numCandidates = candidates.size();
+
+  std::vector<TypePtr> columnTypes;
+  columnTypes.reserve(numColumns);
+  // Round-robin over the candidates so each type is represented evenly.
+  for (int column = 0; column < numColumns; ++column) {
+    columnTypes.push_back(candidates[column % numCandidates]);
+  }
+
+  // Randomize the column order; 'gen' has a fixed seed.
+  std::ranges::shuffle(columnTypes, gen);
+
+  return ROW(std::move(columnTypes));
+}
+
+/// Fills 'partitions' with one pseudo-random partition id in
+/// [0, numPartitions) per row of 'vector', drawing from the file-level
+/// engine 'gen'. 'partitions' is resized to the row count first.
+auto randomPartitionFunction = [](const RowVectorPtr& vector,
+                                  uint32_t numPartitions,
+                                  std::vector<uint32_t>& partitions) {
+  const auto numRows = vector->size();
+  partitions.resize(numRows);
+  for (auto& partition : partitions) {
+    partition = gen() % numPartitions;
+  }
+};
+
+/// Builds benchmark row vectors, one column at a time.
+class VectorBuilder : public VectorTestBase {
+ public:
+  /// Builds a RowVector with one child per child type of 'rowType'.
+  ///
+  /// @param rowType Row type whose children determine the column types.
+  /// @param numRows Number of rows in every column.
+  /// @param isNullAt Null predicate forwarded to every column.
+  RowVectorPtr makeRowVector(
+      const RowTypePtr& rowType,
+      vector_size_t numRows,
+      const std::function<bool(vector_size_t)>& isNullAt) {
+    const auto numColumns = rowType->size();
+    std::vector<VectorPtr> children;
+    children.reserve(numColumns);
+    for (uint32_t i = 0; i < numColumns; ++i) {
+      children.push_back(makeColumn(rowType->childAt(i), numRows, isNullAt));
+    }
+    return VectorTestBase::makeRowVector(children);
+  }
+
+ private:
+  /// Builds a flat column of native type T whose value at each row is the row
+  /// number converted to T.
+  template <typename T>
+  VectorPtr makeNumericColumn(
+      const TypePtr& type,
+      vector_size_t size,
+      const std::function<bool(vector_size_t)>& isNullAt) {
+    return makeFlatVector<T>(
+        size, [](auto row) { return static_cast<T>(row); }, isNullAt, type);
+  }
+
+  /// Builds a single flat column of 'size' rows for 'type', dispatching on the
+  /// type's kind. Throws via VELOX_UNSUPPORTED for kinds not listed below.
+  VectorPtr makeColumn(
+      const TypePtr& type,
+      vector_size_t size,
+      const std::function<bool(vector_size_t)>& isNullAt) {
+    switch (type->kind()) {
+      case TypeKind::BOOLEAN:
+        return makeFlatVector<bool>(
+            size, [](auto row) { return row % 2 == 0; }, isNullAt, type);
+      case TypeKind::TINYINT:
+        return makeNumericColumn<int8_t>(type, size, isNullAt);
+      case TypeKind::SMALLINT:
+        return makeNumericColumn<int16_t>(type, size, isNullAt);
+      case TypeKind::INTEGER:
+        // Plain INTEGER and DATE columns are generated identically, so no
+        // separate DATE branch is needed; 'type' is passed through as is.
+        return makeNumericColumn<int32_t>(type, size, isNullAt);
+      case TypeKind::BIGINT:
+        return makeNumericColumn<int64_t>(type, size, isNullAt);
+      case TypeKind::HUGEINT:
+        return makeNumericColumn<int128_t>(type, size, isNullAt);
+      case TypeKind::REAL:
+        return makeNumericColumn<float>(type, size, isNullAt);
+      case TypeKind::DOUBLE:
+        return makeNumericColumn<double>(type, size, isNullAt);
+      case TypeKind::TIMESTAMP:
+        return makeFlatVector<Timestamp>(
+            size,
+            [](auto row) { return Timestamp(row, row * 1'000); },
+            isNullAt,
+            type);
+      case TypeKind::VARCHAR:
+      case TypeKind::VARBINARY:
+        // Alternate between short inlined strings (≤12 bytes) and long
+        // out-of-line strings (>12 bytes) to exercise both StringView paths.
+        return makeFlatVector<std::string>(
+            size,
+            [](auto row) -> std::string {
+              if (row % 2 == 0) {
+                return fmt::format("v-{}", row);
+              }
+              return fmt::format("velox_benchmark_string_{:08d}", row);
+            },
+            isNullAt,
+            type);
+      default:
+        VELOX_UNSUPPORTED("Unsupported benchmark type: {}", type->toString());
+    }
+  }
+};
+
+} // namespace
+
+/// Shared body of every registered benchmark. Builds the input row vector and
+/// the partition assignment with timing suspended, then runs
+/// PartitionedVector::create() 'iterations' times. Only the create() call sits
+/// between dismiss() and rehire(); the per-iteration copy of the input is
+/// made while timing is suspended.
+void runBM(
+    uint32_t iterations,
+    const std::function<RowTypePtr(int32_t)>& rowTypeGenerator,
+    int32_t numColumns,
+    uint32_t numPartitions,
+    const std::function<bool(vector_size_t)>& isNullAt = noNulls,
+    vector_size_t numRows = 10'000) {
+  folly::BenchmarkSuspender setupGuard;
+  VectorBuilder builder;
+  auto leafPool = memory::memoryManager()->addLeafPool();
+  auto* rawPool = leafPool.get();
+  PartitionBuildContext buildContext;
+
+  const auto rowType = rowTypeGenerator(numColumns);
+  auto source = builder.makeRowVector(rowType, numRows, isNullAt);
+
+  std::vector<uint32_t> assignments;
+  randomPartitionFunction(source, numPartitions, assignments);
+
+  for (uint32_t remaining = iterations; remaining > 0; --remaining) {
+    // Each iteration gets its own copy of the source vector.
+    const auto input = std::static_pointer_cast<RowVector>(
+        BaseVector::copy(*source, rawPool));
+    setupGuard.dismiss();
+    PartitionedVector::create(
+        input, assignments, numPartitions, buildContext, rawPool);
+    setupGuard.rehire();
+  }
+}
+
+// Registers one runBM benchmark through BENCHMARK_NAMED_PARAM. The benchmark
+// is named <name>_<numCols>Cols_<nulls>_P<numParts>, and runBM receives
+// (generator, numCols, numParts, nulls) after the iteration count.
+#define BENCHMARK_CONFIG(name, generator, numCols, nulls, numParts) \
+  BENCHMARK_NAMED_PARAM(                                            \
+      runBM,                                                        \
+      name##_##numCols##Cols_##nulls##_P##numParts,                 \
+      generator,                                                    \
+      numCols,                                                      \
+      numParts,                                                     \
+      nulls);
+
+// Expands BENCHMARK_CONFIG for 4, 16, 64, 256 and 1024 partitions.
+#define BENCHMARK_PARTITIONS(name, generator, numCols, nulls) \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 4)        \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 16)       \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 64)       \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 256)      \
+  BENCHMARK_CONFIG(name, generator, numCols, nulls, 1024)
+
+// Expands BENCHMARK_PARTITIONS for 1, 10, 100 and 1000 columns.
+#define BENCHMARK_SIZES(name, generator, nulls)     \
+  BENCHMARK_PARTITIONS(name, generator, 1, nulls)   \
+  BENCHMARK_PARTITIONS(name, generator, 10, nulls)  \
+  BENCHMARK_PARTITIONS(name, generator, 100, nulls) \
+  BENCHMARK_PARTITIONS(name, generator, 1000, nulls)
+
+// Expands BENCHMARK_SIZES for each null predicate: noNulls, allNulls and
+// halfNulls.
+#define BENCHMARK_TYPE(name, generator)      \
+  BENCHMARK_SIZES(name, generator, noNulls)  \
+  BENCHMARK_SIZES(name, generator, allNulls) \
+  BENCHMARK_SIZES(name, generator, halfNulls)
+
+// One BENCHMARK_TYPE expansion per column type: ten scalar kinds, DATE, short
+// and long DECIMAL, and a shuffled mix of flat types. Each expansion covers
+// 3 null predicates x 4 column counts x 5 partition counts.
+BENCHMARK_TYPE(BOOLEAN, scalarTypeGenerator<TypeKind::BOOLEAN>);
+BENCHMARK_TYPE(SMALLINT, scalarTypeGenerator<TypeKind::SMALLINT>);
+BENCHMARK_TYPE(INTEGER, scalarTypeGenerator<TypeKind::INTEGER>);
+BENCHMARK_TYPE(BIGINT, scalarTypeGenerator<TypeKind::BIGINT>);
+BENCHMARK_TYPE(HUGEINT, scalarTypeGenerator<TypeKind::HUGEINT>);
+BENCHMARK_TYPE(REAL, scalarTypeGenerator<TypeKind::REAL>);
+BENCHMARK_TYPE(DOUBLE, scalarTypeGenerator<TypeKind::DOUBLE>);
+BENCHMARK_TYPE(TIMESTAMP, scalarTypeGenerator<TypeKind::TIMESTAMP>);
+BENCHMARK_TYPE(VARCHAR, scalarTypeGenerator<TypeKind::VARCHAR>);
+BENCHMARK_TYPE(VARBINARY, scalarTypeGenerator<TypeKind::VARBINARY>);
+BENCHMARK_TYPE(DATE, dateTypeGenerator);
+BENCHMARK_TYPE(ShortDecimal, shortDecimalTypeGenerator);
+BENCHMARK_TYPE(LongDecimal, longDecimalTypeGenerator);
+BENCHMARK_TYPE(Mixed, mixedFlatTypeGenerator);
+
+} // namespace facebook::velox::test
+
+/// Benchmark entry point: initializes folly, sets up the Velox memory manager
+/// with default options, then runs every registered benchmark.
+int main(int argc, char** argv) {
+  folly::Init follyInit{&argc, &argv};
+  const memory::MemoryManager::Options memoryOptions{};
+  memory::MemoryManager::initialize(memoryOptions);
+  folly::runBenchmarks();
+  return 0;
+}
diff --git a/velox/vector/tests/CMakeLists.txt b/velox/vector/tests/CMakeLists.txt
index 24478b9c8e5..08277820124 100644
--- a/velox/vector/tests/CMakeLists.txt
+++ b/velox/vector/tests/CMakeLists.txt
@@ -25,6 +25,7 @@ add_executable(
   LazyVectorTest.cpp
   MapConcatTest.cpp
   MayHaveNullsRecursiveTest.cpp
+  PartitionedVectorTest.cpp
   SelectivityVectorTest.cpp
   StringVectorBufferTest.cpp
   VariantToVectorTest.cpp
diff --git a/velox/vector/tests/PartitionedVectorTest.cpp b/velox/vector/tests/PartitionedVectorTest.cpp
new file mode 100644
index 00000000000..569a6e6ae9f
--- /dev/null
+++ b/velox/vector/tests/PartitionedVectorTest.cpp
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <random>
+
+#include <gtest/gtest.h>
+
+#include "vector/tests/utils/VectorTestBase.h"
+#include "velox/vector/PartitionedVector.h"
+#include "velox/vector/tests/utils/PartitionedVectorTestBase.h"
+
+namespace facebook::velox::test {
+
+/// Value-parameterized fixture for PartitionedVector tests. The test parameter
+/// is the number of rows in the vector being partitioned. Results of
+/// PartitionedVector::create() are compared against the dictionary-wrapping
+/// reference implementation from PartitionedVectorTestBase.
+class PartitioningVectorTest : public testing::TestWithParam<int>,
+                               public test::PartitionedVectorTestBase {
+ protected:
+  /// Seed for 'gen_'. It comes from std::random_device and so differs between
+  /// runs; the random-partition check reports it via SCOPED_TRACE so a
+  /// failing run can be reproduced.
+  const uint32_t seed_ = std::random_device{}();
+  std::mt19937 gen_ = std::mt19937(seed_);
+
+  PartitionBuildContext ctx_;
+  // Not referenced by the helpers below; kept so the fixture's members stay
+  // unchanged.
+  BufferPtr partitionOffsets_;
+
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance({});
+  }
+
+  /// Partitions 'vector' with PartitionedVector::create() and checks every
+  /// partition against the reference result.
+  ///
+  /// @param vector Vector handed to create().
+  /// @param partitions One partition id per row of 'vector'.
+  /// @param numPartitions Number of partitions to produce.
+  void testPartitionedVector(
+      VectorPtr vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions) {
+    // Build the expected result from a copy taken before create() is given
+    // 'vector' itself.
+    VectorPtr vectorCopy = BaseVector::copy(*vector);
+    std::vector<VectorPtr> expectedVectors =
+        partitionVectorByWrapping(vectorCopy, partitions, numPartitions);
+
+    // Initialize buffers needed for PartitionedVector::create()
+    ensureCapacity<vector_size_t>(
+        ctx_.cursorPartitionOffsets, numPartitions, pool_.get());
+
+    // Create the partitioned vector using the actual implementation
+    auto partitionedVector = PartitionedVector::create(
+        vector, partitions, numPartitions, ctx_, pool_.get());
+    ASSERT_TRUE(partitionedVector != nullptr);
+
+    // Extract every partition first, then compare, so the comparison runs on
+    // results obtained after all partitionAt() calls have been made.
+    std::vector<VectorPtr> partitionedVectors;
+    partitionedVectors.reserve(numPartitions);
+    for (uint32_t i = 0; i < numPartitions; ++i) {
+      partitionedVectors.push_back(partitionedVector->partitionAt(i));
+    }
+
+    for (uint32_t i = 0; i < numPartitions; ++i) {
+      test::assertEqualVectors(
+          expectedVectors[i], canonicalize(partitionedVectors[i]));
+    }
+  }
+
+  /// Runs testPartitionedVector() on 'vector' with a series of partition
+  /// assignments: one partition, alternating two and three partitions, four
+  /// partitions with an empty first or last partition, one row per partition,
+  /// and a random assignment. Every case runs on a fresh copy of 'vector'.
+  void testVectorPartitioning(VectorPtr vector) {
+    const auto numRows = vector->size();
+    std::vector<uint32_t> partitions(numRows);
+
+    // Checks the current contents of 'partitions' against a fresh copy.
+    const auto check = [&](uint32_t numPartitions) {
+      testPartitionedVector(
+          BaseVector::copy(*vector, pool_.get()), partitions, numPartitions);
+    };
+    // Assigns row i to partition (i % modulus) + shift.
+    const auto assignRoundRobin = [&](uint32_t modulus, uint32_t shift) {
+      for (uint32_t i = 0; i < partitions.size(); ++i) {
+        partitions[i] = i % modulus + shift;
+      }
+    };
+
+    // Test with single partition
+    std::fill(partitions.begin(), partitions.end(), 0u);
+    check(1);
+
+    // Test with two partitions
+    if (numRows >= 3) {
+      assignRoundRobin(2, 0);
+      check(2);
+    }
+
+    // Test with three partitions
+    assignRoundRobin(3, 0);
+    check(3);
+
+    if (numRows > 4) {
+      // Test with four partitions where the first partition is empty
+      assignRoundRobin(3, 1);
+      check(4);
+
+      // Test with four partitions where the last partition is empty
+      assignRoundRobin(3, 0);
+      check(4);
+    }
+
+    // Test with one value per partition
+    if (numRows > 0) {
+      std::iota(partitions.begin(), partitions.end(), 0);
+      check(numRows);
+    }
+
+    // Test with random partitions (number of partitions <= number of values).
+    // The distribution is only constructed for non-empty input:
+    // uniform_int_distribution(a, b) requires a <= b, which (0, numRows - 1)
+    // violates when numRows is 0.
+    SCOPED_TRACE(::testing::Message() << "random partition seed: " << seed_);
+    uint32_t maxPartition = 0;
+    if (numRows > 0) {
+      std::uniform_int_distribution<uint32_t> dis(
+          0, static_cast<uint32_t>(numRows) - 1);
+      for (auto& partition : partitions) {
+        partition = dis(gen_);
+        maxPartition = std::max(maxPartition, partition);
+      }
+    }
+    check(maxPartition + 1);
+  }
+};
+
+TEST_P(PartitioningVectorTest, testFlatVector) {
+  // The test parameter is the row count of the vector under test; it covers
+  // edge cases such as 0 and 1.
+  const int rowCount = GetParam();
+  const auto rowNumber = [](auto row) { return row; };
+
+  // Value equals the row number, no nulls.
+  auto withoutNulls = makeFlatVector<int>(rowCount, rowNumber);
+  testVectorPartitioning(withoutNulls);
+
+  // Same values, with nulls given by nullEvery(2, 1).
+  auto withSomeNulls = makeFlatVector<int>(rowCount, rowNumber, nullEvery(2, 1));
+  testVectorPartitioning(withSomeNulls);
+
+  // Every row null.
+  auto onlyNulls = makeAllNullFlatVector<int>(rowCount);
+  testVectorPartitioning(onlyNulls);
+}
+
+TEST_P(PartitioningVectorTest, testFlatBoolVector) {
+  const int rowCount = GetParam();
+  const auto isEvenRow = [](auto row) { return row % 2 == 0; };
+
+  // True on even rows, no nulls.
+  auto withoutNulls = makeFlatVector<bool>(rowCount, isEvenRow);
+  testVectorPartitioning(withoutNulls);
+
+  // Same values, with nulls given by nullEvery(2, 1).
+  auto withSomeNulls =
+      makeFlatVector<bool>(rowCount, isEvenRow, nullEvery(2, 1));
+  testVectorPartitioning(withSomeNulls);
+
+  // Every row null.
+  auto onlyNulls = makeAllNullFlatVector<bool>(rowCount);
+  testVectorPartitioning(onlyNulls);
+}
+
+TEST_P(PartitioningVectorTest, testRowVector) {
+  const int rowCount = GetParam();
+  const auto rowNumber = [](auto row) { return row; };
+  const auto rowNumberTimesTen = [](auto row) { return row * 10; };
+  const auto everyRowNull = [](auto /*row*/) { return true; };
+
+  // Two flat columns, no nulls at any level.
+  auto plainRow = makeRowVector({
+      makeFlatVector<int32_t>(rowCount, rowNumber),
+      makeFlatVector<int64_t>(rowCount, rowNumberTimesTen),
+  });
+  testVectorPartitioning(plainRow);
+
+  // Two flat columns with nullable children.
+  auto nullableChildren = makeRowVector({
+      makeFlatVector<int32_t>(rowCount, rowNumber, nullEvery(2)),
+      makeFlatVector<int64_t>(rowCount, rowNumberTimesTen, nullEvery(3)),
+  });
+  testVectorPartitioning(nullableChildren);
+
+  // Row-level nulls with no child nulls.
+  auto nullableRows = makeRowVector(
+      {makeFlatVector<int32_t>(rowCount, rowNumber)}, nullEvery(2));
+  testVectorPartitioning(nullableRows);
+
+  // Row-level nulls combined with nullable children.
+  auto nullableRowsAndChildren = makeRowVector(
+      {makeFlatVector<int32_t>(rowCount, rowNumber, nullEvery(3))},
+      nullEvery(2));
+  testVectorPartitioning(nullableRowsAndChildren);
+
+  // All rows null.
+  auto allRowsNull = makeRowVector(
+      {makeFlatVector<int32_t>(rowCount, rowNumber)}, everyRowNull);
+  testVectorPartitioning(allRowsNull);
+
+  // Nested RowVector.
+  auto nestedRow = makeRowVector({
+      makeFlatVector<int32_t>(rowCount, rowNumber),
+      makeRowVector({
+          makeFlatVector<int64_t>(rowCount, rowNumber),
+      }),
+  });
+  testVectorPartitioning(nestedRow);
+}
+
+TEST_P(PartitioningVectorTest, testConstantVector) {
+  const int rowCount = GetParam();
+
+  // Non-null scalar constant.
+  auto scalarConstant = makeConstant<int32_t>(7, rowCount);
+  testVectorPartitioning(scalarConstant);
+
+  // Null scalar constant.
+  auto nullConstant = makeConstant<int32_t>(std::nullopt, rowCount);
+  testVectorPartitioning(nullConstant);
+
+  // Constant of ROW type with an INTEGER and a VARCHAR field.
+  const auto rowType = ROW({"c0", "c1"}, {INTEGER(), VARCHAR()});
+  auto rowConstant = makeConstantRow(
+      rowType, variant::row({variant(11), variant("constant")}), rowCount);
+  testVectorPartitioning(rowConstant);
+}
+
+// A flat vector without nulls must still report no nulls after it has been
+// partitioned.
+TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeFlat) {
+  const int rowCount = GetParam();
+  if (rowCount == 0) {
+    // Nothing to partition.
+    return;
+  }
+
+  auto input = makeFlatVector<int32_t>(rowCount, [](auto row) { return row; });
+  ASSERT_FALSE(input->mayHaveNulls());
+
+  // Alternate rows between two partitions.
+  std::vector<uint32_t> assignments(rowCount);
+  for (size_t row = 0; row < assignments.size(); ++row) {
+    assignments[row] = row % 2;
+  }
+
+  auto partitioned =
+      PartitionedVector::create(input, assignments, 2, ctx_, pool_.get());
+  EXPECT_FALSE(partitioned->baseVector()->mayHaveNulls())
+      << "partition() must not allocate a null buffer for a null-free FlatVector";
+}
+
+// A RowVector without nulls at the row level or in its children must still
+// report no nulls at either level after it has been partitioned.
+TEST_P(PartitioningVectorTest, noNullBufferAllocatedForNullFreeRow) {
+  const int rowCount = GetParam();
+  if (rowCount == 0) {
+    // Nothing to partition.
+    return;
+  }
+
+  auto input = makeRowVector({
+      makeFlatVector<int32_t>(rowCount, [](auto row) { return row; }),
+      makeFlatVector<int64_t>(rowCount, [](auto row) { return row * 10; }),
+  });
+  ASSERT_FALSE(input->mayHaveNulls());
+  ASSERT_FALSE(input->childAt(0)->mayHaveNulls());
+  ASSERT_FALSE(input->childAt(1)->mayHaveNulls());
+
+  // Alternate rows between two partitions.
+  std::vector<uint32_t> assignments(rowCount);
+  for (size_t row = 0; row < assignments.size(); ++row) {
+    assignments[row] = row % 2;
+  }
+
+  auto partitioned =
+      PartitionedVector::create(input, assignments, 2, ctx_, pool_.get());
+  auto* partitionedRow = partitioned->baseVector()->as<RowVector>();
+  EXPECT_FALSE(partitionedRow->mayHaveNulls())
+      << "partition() must not allocate a null buffer for a null-free RowVector";
+  EXPECT_FALSE(partitionedRow->childAt(0)->mayHaveNulls())
+      << "partition() must not allocate a null buffer for null-free child 0";
+  EXPECT_FALSE(partitionedRow->childAt(1)->mayHaveNulls())
+      << "partition() must not allocate a null buffer for null-free child 1";
+}
+
+// numNullsAt() tests
+// ---------------------------------------------------------------------------
+
+// With no nulls in the flat input, numNullsAt() must be zero for each of the
+// three partitions.
+TEST_P(PartitioningVectorTest, numNullsAtFlatNoNulls) {
+  constexpr uint32_t kNumPartitions = 3;
+  const int rowCount = GetParam();
+  auto input = makeFlatVector<int32_t>(rowCount, [](auto row) { return row; });
+
+  // Round-robin assignment of rows to partitions.
+  std::vector<uint32_t> assignments(rowCount);
+  for (size_t row = 0; row < assignments.size(); ++row) {
+    assignments[row] = row % kNumPartitions;
+  }
+
+  auto partitioned = PartitionedVector::create(
+      input, assignments, kNumPartitions, ctx_, pool_.get());
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    EXPECT_EQ(partitioned->numNullsAt(p), 0) << "partition " << p;
+  }
+}
+
+// For a flat vector built with nullEvery(2), numNullsAt() must match a count
+// of the null bits inside each partition's row range, and the per-partition
+// counts must add up to the null count of the source vector.
+TEST_P(PartitioningVectorTest, numNullsAtFlatSomeNulls) {
+  constexpr uint32_t kNumPartitions = 3;
+  const int rowCount = GetParam();
+  auto input = makeFlatVector<int32_t>(
+      rowCount, [](auto row) { return row; }, nullEvery(2));
+
+  // Round-robin assignment of rows to partitions.
+  std::vector<uint32_t> assignments(rowCount);
+  for (size_t row = 0; row < assignments.size(); ++row) {
+    assignments[row] = row % kNumPartitions;
+  }
+  auto partitioned = PartitionedVector::create(
+      input, assignments, kNumPartitions, ctx_, pool_.get());
+
+  // The offsets are read as end positions: partition p spans
+  // [offsets[p - 1], offsets[p]), with partition 0 starting at row 0.
+  const auto* baseRawNulls = partitioned->baseVector()->rawNulls();
+  const auto* offsets = partitioned->rawPartitionOffsets();
+  vector_size_t rangeBegin = 0;
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    const vector_size_t rangeEnd = offsets[p];
+    vector_size_t expectedNulls = 0;
+    if (baseRawNulls) {
+      expectedNulls = BaseVector::countNulls(
+          partitioned->baseVector()->nulls(), rangeBegin, rangeEnd);
+    }
+    EXPECT_EQ(partitioned->numNullsAt(p), expectedNulls) << "partition " << p;
+    rangeBegin = rangeEnd;
+  }
+
+  // The per-partition counts together must cover every null of the source.
+  vector_size_t totalNulls = 0;
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    totalNulls += partitioned->numNullsAt(p);
+  }
+  EXPECT_EQ(totalNulls, BaseVector::countNulls(input->nulls(), 0, rowCount));
+}
+
+// When every row of the flat input is null, numNullsAt(p) must equal the
+// number of rows that landed in partition p.
+TEST_P(PartitioningVectorTest, numNullsAtFlatAllNulls) {
+  constexpr uint32_t kNumPartitions = 3;
+  const int rowCount = GetParam();
+  auto input = makeAllNullFlatVector<int32_t>(rowCount);
+
+  // Round-robin assignment of rows to partitions.
+  std::vector<uint32_t> assignments(rowCount);
+  for (size_t row = 0; row < assignments.size(); ++row) {
+    assignments[row] = row % kNumPartitions;
+  }
+  auto partitioned = PartitionedVector::create(
+      input, assignments, kNumPartitions, ctx_, pool_.get());
+
+  // The partition size is the difference between consecutive end offsets.
+  const auto* offsets = partitioned->rawPartitionOffsets();
+  vector_size_t rangeBegin = 0;
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    const vector_size_t rowsInPartition = offsets[p] - rangeBegin;
+    EXPECT_EQ(partitioned->numNullsAt(p), rowsInPartition)
+        << "partition " << p;
+    rangeBegin = offsets[p];
+  }
+}
+
+// Nulls that exist only in a child column must not show up in the row-level
+// numNullsAt(): with no row-level nulls every partition must report zero.
+TEST_P(PartitioningVectorTest, numNullsAtRowNoRowLevelNulls) {
+  constexpr uint32_t kNumPartitions = 3;
+  const int rowCount = GetParam();
+  auto nullableChild = makeFlatVector<int32_t>(
+      rowCount, [](auto row) { return row; }, nullEvery(2));
+  auto input = makeRowVector({nullableChild});
+  ASSERT_FALSE(input->mayHaveNulls());
+
+  // Round-robin assignment of rows to partitions.
+  std::vector<uint32_t> assignments(rowCount);
+  for (size_t row = 0; row < assignments.size(); ++row) {
+    assignments[row] = row % kNumPartitions;
+  }
+
+  auto partitioned = PartitionedVector::create(
+      input, assignments, kNumPartitions, ctx_, pool_.get());
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    EXPECT_EQ(partitioned->numNullsAt(p), 0)
+        << "Row-level numNullsAt() must not count child nulls, partition " << p;
+  }
+}
+
+// With nulls at both the row level and in the child, the row-level
+// numNullsAt() must match a count of the row vector's null bits per
+// partition, and the child's numNullsAt() must match a count of the child's
+// own null bits.
+TEST_P(PartitioningVectorTest, numNullsAtRowRowLevelNulls) {
+  constexpr uint32_t kNumPartitions = 3;
+  const int rowCount = GetParam();
+  auto nullableChild = makeFlatVector<int32_t>(
+      rowCount, [](auto row) { return row; }, nullEvery(3));
+  auto input = makeRowVector({nullableChild}, nullEvery(2));
+
+  // Round-robin assignment of rows to partitions.
+  std::vector<uint32_t> assignments(rowCount);
+  for (size_t row = 0; row < assignments.size(); ++row) {
+    assignments[row] = row % kNumPartitions;
+  }
+  auto partitioned = PartitionedVector::create(
+      input, assignments, kNumPartitions, ctx_, pool_.get());
+
+  // Row level: partition p spans [offsets[p - 1], offsets[p]).
+  const auto* rowOffsets = partitioned->rawPartitionOffsets();
+  vector_size_t rangeBegin = 0;
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    const vector_size_t rangeEnd = rowOffsets[p];
+    const vector_size_t expectedNulls = BaseVector::countNulls(
+        partitioned->baseVector()->nulls(), rangeBegin, rangeEnd);
+    EXPECT_EQ(partitioned->numNullsAt(p), expectedNulls)
+        << "Row-level null count mismatch, partition " << p;
+    rangeBegin = rangeEnd;
+  }
+
+  // Child level: counted from the child's own null bits and offsets.
+  auto* partitionedRow = dynamic_cast<PartitionedRowVector*>(partitioned.get());
+  ASSERT_NE(partitionedRow, nullptr);
+  auto partitionedChild = partitionedRow->childAt(0);
+  const auto* childOffsets = partitionedChild->rawPartitionOffsets();
+  vector_size_t childBegin = 0;
+  for (uint32_t p = 0; p < kNumPartitions; ++p) {
+    const vector_size_t childEnd = childOffsets[p];
+    const vector_size_t expectedNulls = BaseVector::countNulls(
+        partitionedChild->baseVector()->nulls(), childBegin, childEnd);
+    EXPECT_EQ(partitionedChild->numNullsAt(p), expectedNulls)
+        << "Child null count mismatch, partition " << p;
+    childBegin = childEnd;
+  }
+}
+
+// Runs every TEST_P of PartitioningVectorTest once per row count: 0 (empty),
+// 1 (single row), 10 and 10000. Despite the "FlatVectorSizes" prefix, the
+// same row counts also drive the row-vector and constant-vector tests.
+INSTANTIATE_TEST_SUITE_P(
+    FlatVectorSizes,
+    PartitioningVectorTest,
+    ::testing::Values(0, 1, 10, 10000));
+
+} // namespace facebook::velox::test
diff --git a/velox/vector/tests/utils/CMakeLists.txt b/velox/vector/tests/utils/CMakeLists.txt
index 9e7fbae65b6..35a56901ccf 100644
--- a/velox/vector/tests/utils/CMakeLists.txt
+++ b/velox/vector/tests/utils/CMakeLists.txt
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-add_library(velox_vector_test_lib VectorMaker.cpp VectorTestBase.cpp)
+add_library(velox_vector_test_lib PartitionedVectorTestBase.cpp VectorMaker.cpp VectorTestBase.cpp)
 velox_add_test_headers(
   velox_vector_test_lib
   VectorMaker-inl.h
diff --git a/velox/vector/tests/utils/PartitionedVectorTestBase.cpp b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp
new file mode 100644
index 00000000000..e9191ba0b8f
--- /dev/null
+++ b/velox/vector/tests/utils/PartitionedVectorTestBase.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/vector/tests/utils/PartitionedVectorTestBase.h"
+
+namespace facebook::velox::test {
+
+// Returns 'vector' wrapped in a dictionary whose indices list its rows in
+// ascending order of BaseVector::compare(). The sort is stable, so rows that
+// compare equal keep their original relative order.
+VectorPtr PartitionedVectorTestBase::canonicalize(VectorPtr vector) {
+  const auto size = vector->size();
+  const auto* base = vector.get();
+
+  // Start from the identity permutation and sort it in place.
+  auto order = makeIndices(size, [](auto row) { return row; });
+  auto* first = order->asMutable<vector_size_t>();
+  auto* last = first + size;
+
+  const auto precedes = [base](vector_size_t lhs, vector_size_t rhs) {
+    return base->compare(base, lhs, rhs) < 0;
+  };
+  std::stable_sort(first, last, precedes);
+
+  return wrapInDictionary(order, size, vector);
+}
+
+// Reference partitioning used to validate PartitionedVector. Returns one
+// vector per partition: an empty vector of the input type for a partition
+// with no rows, otherwise a dictionary over a copy of 'vector' that selects
+// the partition's rows, passed through canonicalize().
+//
+// 'partitions' must hold a partition id below 'numPartitions' for each row of
+// 'vector'; violations fail a VELOX_CHECK instead of writing out of bounds.
+std::vector<VectorPtr> PartitionedVectorTestBase::partitionVectorByWrapping(
+    VectorPtr vector,
+    const std::vector<uint32_t>& partitions,
+    uint32_t numPartitions) {
+  const auto numRows = vector->size();
+  VELOX_CHECK_GE(
+      partitions.size(),
+      static_cast<size_t>(numRows),
+      "Every row needs a partition id");
+
+  // Collect the row ids of every partition in a single pass over the input.
+  // Rows are visited in ascending order, so each list stays sorted.
+  std::vector<std::vector<vector_size_t>> rowIdsByPartition(numPartitions);
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    const auto partition = partitions[row];
+    VELOX_CHECK_LT(
+        partition, numPartitions, "Partition id out of range at row {}", row);
+    rowIdsByPartition[partition].push_back(row);
+  }
+
+  std::vector<VectorPtr> partitionedVectors;
+  partitionedVectors.reserve(numPartitions);
+
+  for (const auto& rowIds : rowIdsByPartition) {
+    if (rowIds.empty()) {
+      partitionedVectors.push_back(
+          BaseVector::create(vector->type(), 0, pool_.get()));
+      continue;
+    }
+
+    const auto numRowsInPartition = static_cast<vector_size_t>(rowIds.size());
+    auto indices = makeIndices(
+        numRowsInPartition, [&rowIds](auto row) { return rowIds[row]; });
+
+    // Simulate partitioning by wrapping a copy of the input in a dictionary
+    // built from the partition's row ids.
+    // NOTE(review): the per-partition copy is kept from the original code;
+    // sharing 'vector' across the dictionaries is presumably sufficient and
+    // would avoid copying the input once per partition - confirm.
+    VectorPtr vectorCopy = BaseVector::copy(*vector, pool_.get());
+    auto dictionaryVector = BaseVector::wrapInDictionary(
+        nullptr, indices, numRowsInPartition, vectorCopy);
+    partitionedVectors.push_back(canonicalize(dictionaryVector));
+  }
+  return partitionedVectors;
+}
+
+// Merges 'rowVectors' into a single vector, assigns every row a partition and
+// returns the reference partitioning, one canonicalized vector per partition.
+//
+// With 'numPartitions' > 1 the assignment comes from 'partitionFunction',
+// which must then be non-null; when it reports a single partition for the
+// whole input, every row is assigned to that partition. Otherwise all rows go
+// to partition 0. 'rowVectors' must not be empty.
+std::vector<VectorPtr> PartitionedVectorTestBase::partitionRowVectors(
+    const std::vector<RowVectorPtr>& rowVectors,
+    int32_t numPartitions,
+    core::PartitionFunction* partitionFunction) {
+  VELOX_CHECK(
+      !rowVectors.empty(), "partitionRowVectors() requires at least one vector");
+
+  // Convert element by element. std::vector<RowVectorPtr> and
+  // std::vector<VectorPtr> are unrelated types, so casting a reference from
+  // one to the other is undefined behaviour.
+  const std::vector<VectorPtr> inputs(rowVectors.begin(), rowVectors.end());
+  VectorPtr mergedRowVector = mergeVectors(inputs);
+  const auto totalNumRows = mergedRowVector->size();
+
+  std::vector<uint32_t> partitions(totalNumRows, 0);
+  if (numPartitions > 1) {
+    VELOX_CHECK_NOT_NULL(
+        partitionFunction,
+        "A partition function is required for more than one partition");
+    const std::optional<uint32_t> singlePartition =
+        partitionFunction->partition(
+            *mergedRowVector->as<RowVector>(), partitions);
+    if (singlePartition.has_value()) {
+      // All rows go to the same partition
+      std::fill(partitions.begin(), partitions.end(), singlePartition.value());
+    }
+  }
+
+  std::vector<VectorPtr> partitionedVectors =
+      partitionVectorByWrapping(mergedRowVector, partitions, numPartitions);
+
+  // Canonicalize every partition, including the empty ones that
+  // partitionVectorByWrapping() returns without wrapping.
+  for (auto& vector : partitionedVectors) {
+    vector = canonicalize(vector);
+  }
+  return partitionedVectors;
+}
+
+// Concatenates 'vectors' into one new vector: copies the first vector and
+// appends the remaining ones in order. The inputs are not modified.
+// 'vectors' must contain at least one non-null vector at the front.
+VectorPtr PartitionedVectorTestBase::mergeVectors(
+    const std::vector<VectorPtr>& vectors) {
+  VELOX_CHECK(!vectors.empty(), "mergeVectors() requires at least one vector");
+  VELOX_CHECK_NOT_NULL(vectors.front(), "The first vector must not be null");
+
+  auto mergedVector = BaseVector::copy(*vectors.front());
+  for (size_t i = 1; i < vectors.size(); ++i) {
+    mergedVector->append(vectors[i].get());
+  }
+
+  return mergedVector;
+}
+
+} // namespace facebook::velox::test
diff --git a/velox/vector/tests/utils/PartitionedVectorTestBase.h b/velox/vector/tests/utils/PartitionedVectorTestBase.h
new file mode 100644
index 00000000000..b2c50761edc
--- /dev/null
+++ b/velox/vector/tests/utils/PartitionedVectorTestBase.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/core/PlanNode.h"
+#include "velox/vector/PartitionedVector.h"
+#include "velox/vector/tests/utils/VectorTestBase.h"
+
+namespace facebook::velox::test {
+
+/// Test helpers that provide a reference partitioning based on dictionary
+/// wrapping, for comparison against PartitionedVector.
+class PartitionedVectorTestBase : public VectorTestBase {
+ protected:
+  /// Splits 'vector' into 'numPartitions' vectors according to 'partitions',
+  /// which holds one partition id per row. A partition without rows yields an
+  /// empty vector of the input type; any other partition yields a dictionary
+  /// over a copy of 'vector', passed through canonicalize().
+  std::vector<VectorPtr> partitionVectorByWrapping(
+      VectorPtr vector,
+      const std::vector<uint32_t>& partitions,
+      uint32_t numPartitions);
+
+  /// Merges 'rowVectors' with mergeVectors(), computes the partition of every
+  /// row with 'partitionFunction' when 'numPartitions' > 1 (otherwise all rows
+  /// go to partition 0) and returns the canonicalized result of
+  /// partitionVectorByWrapping().
+  std::vector<VectorPtr> partitionRowVectors(
+      const std::vector<RowVectorPtr>& rowVectors,
+      int32_t numPartitions,
+      core::PartitionFunction* partitionFunction);
+
+  /// Returns 'vector' wrapped in a dictionary that lists its rows in ascending
+  /// order of BaseVector::compare(), using a stable sort.
+  VectorPtr canonicalize(VectorPtr vector);
+
+  /// Returns a copy of the first vector with all remaining vectors appended in
+  /// order. 'vectors' must not be empty.
+  VectorPtr mergeVectors(const std::vector<VectorPtr>& vectors);
+};
+
+} // namespace facebook::velox::test