[VL] Reduce Velox hash shuffle partition buffer memory by evicting large partitions after split (#12089)

wankunde · web-flow · commit 0b1e5115e5a2 · 2026-05-19T09:57:41.000+01:00
diff --git a/backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala b/backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala
@@ -150,6 +150,7 @@ class VeloxCelebornColumnarShuffleWriter[K, V](
           GlutenShuffleUtils.getStartPartitionId(dep.nativePartitioning, context.partitionId),
           nativeBufferSize,
           GlutenConfig.get.columnarShuffleReallocThreshold,
+          GlutenConfig.get.columnarShufflePartitionBufferEvictThreshold,
           partitionWriterHandle
         )
       case SortShuffleWriterType =>
diff --git a/backends-velox/src-uniffle/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java b/backends-velox/src-uniffle/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java
@@ -185,6 +185,7 @@ protected void writeImpl(Iterator<Product2<K, V>> records) {
                         columnarDep.nativePartitioning(), partitionId),
                     nativeBufferSize,
                     reallocThreshold,
+                    GlutenConfig.get().columnarShufflePartitionBufferEvictThreshold(),
                     partitionWriterHandle);
           }
 
diff --git a/backends-velox/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala b/backends-velox/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala
@@ -192,6 +192,7 @@ class ColumnarShuffleWriter[K, V](
                 taskContext.partitionId),
               nativeBufferSize,
               reallocThreshold,
+              GlutenConfig.get.columnarShufflePartitionBufferEvictThreshold,
               partitionWriterHandle
             )
           }
diff --git a/cpp/core/jni/JniWrapper.cc b/cpp/core/jni/JniWrapper.cc
@@ -990,6 +990,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe
     jint startPartitionId,
     jint splitBufferSize,
     jdouble splitBufferReallocThreshold,
+    jint partitionBufferEvictThreshold,
     jlong partitionWriterHandle) {
   JNI_METHOD_START
   const auto ctx = getRuntime(env, wrapper);
@@ -1004,7 +1005,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrappe
       toPartitioning(jStringToCString(env, partitioningNameJstr)),
       startPartitionId,
       splitBufferSize,
-      splitBufferReallocThreshold);
+      splitBufferReallocThreshold,
+      partitionBufferEvictThreshold);
 
   return ctx->saveObject(ctx->createShuffleWriter(numPartitions, partitionWriter, shuffleWriterOptions));
   JNI_METHOD_END(kInvalidObjectHandle)
diff --git a/cpp/core/shuffle/Options.h b/cpp/core/shuffle/Options.h
@@ -27,6 +27,7 @@
 namespace gluten {
 
 static constexpr int16_t kDefaultBatchSize = 4096;
+static constexpr int32_t kDefaultPartitionBufferEvictThreshold = -1;
 static constexpr int32_t kDefaultShuffleWriterBufferSize = 4096;
 static constexpr int64_t kDefaultSortBufferThreshold = 64 << 20;
 static constexpr int64_t kDefaultPushMemoryThreshold = 4096;
@@ -85,17 +86,20 @@ struct ShuffleWriterOptions {
 struct HashShuffleWriterOptions : ShuffleWriterOptions {
   int32_t splitBufferSize = kDefaultShuffleWriterBufferSize;
   double splitBufferReallocThreshold = kDefaultSplitBufferReallocThreshold;
+  int32_t partitionBufferEvictThreshold = kDefaultPartitionBufferEvictThreshold;
 
   HashShuffleWriterOptions() : ShuffleWriterOptions(ShuffleWriterType::kHashShuffle) {}
 
   HashShuffleWriterOptions(
       Partitioning partitioning,
       int32_t startPartitionId,
       int32_t partitionBufferSize,
-      double partitionBufferReallocThreshold)
+      double partitionBufferReallocThreshold,
+      int32_t partitionBufferEvictThreshold = kDefaultPartitionBufferEvictThreshold)
       : ShuffleWriterOptions(ShuffleWriterType::kHashShuffle, partitioning, startPartitionId),
         splitBufferSize(partitionBufferSize),
-        splitBufferReallocThreshold(partitionBufferReallocThreshold) {}
+        splitBufferReallocThreshold(partitionBufferReallocThreshold),
+        partitionBufferEvictThreshold(partitionBufferEvictThreshold) {}
 
  protected:
   HashShuffleWriterOptions(ShuffleWriterType shuffleWriterType) : ShuffleWriterOptions(shuffleWriterType) {}
@@ -105,10 +109,12 @@ struct HashShuffleWriterOptions : ShuffleWriterOptions {
       Partitioning partitioning,
       int32_t startPartitionId,
       int32_t partitionBufferSize,
-      double partitionBufferReallocThreshold)
+      double partitionBufferReallocThreshold,
+      int32_t partitionBufferEvictThreshold = kDefaultPartitionBufferEvictThreshold)
       : ShuffleWriterOptions(shuffleWriterType, partitioning, startPartitionId),
         splitBufferSize(partitionBufferSize),
-        splitBufferReallocThreshold(partitionBufferReallocThreshold) {}
+        splitBufferReallocThreshold(partitionBufferReallocThreshold),
+        partitionBufferEvictThreshold(partitionBufferEvictThreshold) {}
 };
 
 struct SortShuffleWriterOptions : ShuffleWriterOptions {
diff --git a/cpp/core/shuffle/Payload.cc b/cpp/core/shuffle/Payload.cc
@@ -60,7 +60,7 @@ arrow::Result<uint8_t> readPayloadType(arrow::io::InputStream* is) {
 }
 
 arrow::Result<int64_t> compressBuffer(
-    const std::shared_ptr<arrow::Buffer>& buffer,
+    const std::shared_ptr<arrow::Buffer> buffer,
     uint8_t* output,
     int64_t outputLength,
     arrow::util::Codec* codec) {
diff --git a/cpp/velox/shuffle/VeloxHashShuffleWriter.cc b/cpp/velox/shuffle/VeloxHashShuffleWriter.cc
@@ -441,9 +441,41 @@ arrow::Status VeloxHashShuffleWriter::doSplit(const facebook::velox::RowVector&
   printPartitionBuffer();
 
   setSplitState(SplitState::kInit);
+  if (partitionBufferEvictThreshold_ > 0) {
+    // After split, evict large partition buffers to free up memory for the next input RowVector.
+    const auto partitionBytes = estimatePartitionBufferBytes();
+    for (uint32_t pid = 0; pid < partitionBytes.size(); ++pid) {
+      if (partitionBufferBase_[pid] > 0 && partitionBytes[pid] >= partitionBufferEvictThreshold_) {
+        RETURN_NOT_OK(evictPartitionBuffers(pid, false));
+      }
+    }
+  }
   return arrow::Status::OK();
 }
 
+std::vector<int64_t> VeloxHashShuffleWriter::estimatePartitionBufferBytes() const { // Per-partition capacity totals.
+  std::vector<int64_t> partitionBytes(numPartitions_, 0); // One zero-initialized slot per partition id.
+  // Pass 1: each element of partitionBuffers_ is indexed by partition id and yields that partition's buffers.
+  for (const auto& columnBuffers : partitionBuffers_) {
+    for (uint32_t pid = 0; pid < columnBuffers.size(); ++pid) {
+      for (const auto& buffer : columnBuffers[pid]) {
+        if (buffer) { // Null buffer slots contribute nothing.
+          partitionBytes[pid] += buffer->capacity(); // Accumulate capacity() into this partition's total.
+        }
+      }
+    }
+  }
+  // Pass 2: complexTypeFlushBuffer_ is indexed by partition id and holds one (possibly null) buffer per entry.
+  for (uint32_t pid = 0; pid < complexTypeFlushBuffer_.size(); ++pid) {
+    const auto& buffer = complexTypeFlushBuffer_[pid];
+    if (buffer) {
+      partitionBytes[pid] += buffer->capacity();
+    }
+  }
+  // NOTE(review): indexing assumes neither container has more entries than numPartitions_ -- confirm.
+  return partitionBytes; // Element i is the summed capacity() recorded for partition i.
+}
+
 arrow::Status VeloxHashShuffleWriter::splitRowVector(const facebook::velox::RowVector& rv) {
   SCOPED_TIMER(cpuWallTimingList_[CpuWallTimingSplitRV]);
 
diff --git a/cpp/velox/shuffle/VeloxHashShuffleWriter.h b/cpp/velox/shuffle/VeloxHashShuffleWriter.h
@@ -278,7 +278,8 @@ class VeloxHashShuffleWriter : public VeloxShuffleWriter {
       MemoryManager* memoryManager)
       : VeloxShuffleWriter(numPartitions, partitionWriter, options, memoryManager),
         splitBufferSize_(options->splitBufferSize),
-        splitBufferReallocThreshold_(options->splitBufferReallocThreshold) {
+        splitBufferReallocThreshold_(options->splitBufferReallocThreshold),
+        partitionBufferEvictThreshold_(options->partitionBufferEvictThreshold) {
     arenas_.resize(numPartitions);
   }
 
@@ -287,6 +288,8 @@ class VeloxHashShuffleWriter : public VeloxShuffleWriter {
 
   arrow::Status initColumnTypes(const facebook::velox::RowVector& rv);
 
+  std::vector<int64_t> estimatePartitionBufferBytes() const; // Sums buffer capacity() per partition id.
+
   arrow::Status splitRowVector(const facebook::velox::RowVector& rv);
 
   arrow::Status initFromRowVector(const facebook::velox::RowVector& rv);
@@ -396,6 +399,7 @@ class VeloxHashShuffleWriter : public VeloxShuffleWriter {
  protected:
   int32_t splitBufferSize_;
   double splitBufferReallocThreshold_;
+  int32_t partitionBufferEvictThreshold_;
 
   std::shared_ptr<arrow::Schema> schema_;
 
diff --git a/docs/Configuration.md b/docs/Configuration.md
@@ -95,6 +95,7 @@ nav_order: 15
 | spark.gluten.sql.columnar.shuffle.compression.threshold             | 100               | If number of rows in a batch falls below this threshold, will copy all buffers into one buffer to compress.                                                                                                                                                                                                                                                                                                                                        |
 | spark.gluten.sql.columnar.shuffle.dictionary.enabled                | false             | Enable dictionary in hash-based shuffle.                                                                                                                                                                                                                                                                                                                                                                                                           |
 | spark.gluten.sql.columnar.shuffle.merge.threshold                   | 0.25              |
+| spark.gluten.sql.columnar.shuffle.partitionBufferEvictThreshold     | -1                | For Velox hash shuffle writer, evict partition buffers larger than this threshold after splitting an input batch. Use non-positive value to disable this feature.                                                                                                                                                                                                                                                                                  |
 | spark.gluten.sql.columnar.shuffle.readerBufferSize                  | 1MB               | Buffer size in bytes for shuffle reader reading input stream from local or remote.                                                                                                                                                                                                                                                                                                                                                                 |
 | spark.gluten.sql.columnar.shuffle.realloc.threshold                 | 0.25              |
 | spark.gluten.sql.columnar.shuffle.sort.columns.threshold            | 100000            | The threshold to determine whether to use sort-based columnar shuffle. Sort-based shuffle will be used if the number of columns is greater than this threshold.                                                                                                                                                                                                                                                                                    |
diff --git a/gluten-arrow/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java b/gluten-arrow/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java
@@ -43,6 +43,7 @@ public native long createHashShuffleWriter(
       int startPartitionId,
       int splitBufferSize,
       double splitBufferReallocThreshold,
+      int partitionBufferEvictThreshold,
       long partitionWriterHandle);
 
   public native long createSortShuffleWriter(
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -225,6 +225,9 @@ class GlutenConfig(conf: SQLConf) extends GlutenCoreConfig(conf) {
 
   def columnarShuffleReallocThreshold: Double = getConf(COLUMNAR_SHUFFLE_REALLOC_THRESHOLD)
 
+  def columnarShufflePartitionBufferEvictThreshold: Int =
+    getConf(COLUMNAR_SHUFFLE_PARTITION_BUFFER_EVICT_THRESHOLD) // non-positive disables eviction
+
   def columnarShuffleMergeThreshold: Double = getConf(SHUFFLE_WRITER_MERGE_THRESHOLD)
 
   def columnarShuffleCodec: Option[String] = getConf(COLUMNAR_SHUFFLE_CODEC)
@@ -1074,6 +1077,14 @@ object GlutenConfig extends ConfigRegistry {
       .checkValue(v => v >= 0 && v <= 1, "Buffer reallocation threshold must between [0, 1]")
       .createWithDefault(0.25)
 
+  val COLUMNAR_SHUFFLE_PARTITION_BUFFER_EVICT_THRESHOLD =
+    buildConf("spark.gluten.sql.columnar.shuffle.partitionBufferEvictThreshold")
+      .doc(
+        "For Velox hash shuffle writer, evict partition buffers larger than this threshold " +
+          "after splitting an input batch. Use non-positive value to disable this feature.")
+      .intConf // NOTE(review): the doc text states no unit -- confirm it is bytes.
+      .createWithDefault(-1) // Non-positive default, so the feature is off unless configured.
+
   val COLUMNAR_SHUFFLE_CODEC =
     buildConf("spark.gluten.sql.columnar.shuffle.codec")
       .doc(

Original file line number	Diff line number	Diff line change
`@@ -150,6 +150,7 @@ class VeloxCelebornColumnarShuffleWriter[K, V](`
`150`	`150`	`GlutenShuffleUtils.getStartPartitionId(dep.nativePartitioning, context.partitionId),`
`151`	`151`	`nativeBufferSize,`
`152`	`152`	`GlutenConfig.get.columnarShuffleReallocThreshold,`
	`153`	`+ GlutenConfig.get.columnarShufflePartitionBufferEvictThreshold,`
`153`	`154`	`partitionWriterHandle`
`154`	`155`	`)`
`155`	`156`	`case SortShuffleWriterType =>`
Original file line number	Diff line number	Diff line change
`@@ -185,6 +185,7 @@ protected void writeImpl(Iterator<Product2<K, V>> records) {`
`185`	`185`	`columnarDep.nativePartitioning(), partitionId),`
`186`	`186`	`nativeBufferSize,`
`187`	`187`	`reallocThreshold,`
	`188`	`+ GlutenConfig.get().columnarShufflePartitionBufferEvictThreshold(),`
`188`	`189`	`partitionWriterHandle);`
`189`	`190`	`}`
`190`	`191`
Original file line number	Diff line number	Diff line change
`@@ -192,6 +192,7 @@ class ColumnarShuffleWriter[K, V](`
`192`	`192`	`taskContext.partitionId),`
`193`	`193`	`nativeBufferSize,`
`194`	`194`	`reallocThreshold,`
	`195`	`+ GlutenConfig.get.columnarShufflePartitionBufferEvictThreshold,`
`195`	`196`	`partitionWriterHandle`
`196`	`197`	`)`
`197`	`198`	`}`
Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,7 @@ arrow::Result<uint8_t> readPayloadType(arrow::io::InputStream* is) {`
`60`	`60`	`}`
`61`	`61`
`62`	`62`	`arrow::Result<int64_t> compressBuffer(`
`63`		`- const std::shared_ptr<arrow::Buffer>& buffer,`
	`63`	`+ const std::shared_ptr<arrow::Buffer> buffer,`
`64`	`64`	`uint8_t* output,`
`65`	`65`	`int64_t outputLength,`
`66`	`66`	`arrow::util::Codec* codec) {`