
Commit 9750b26

feat: add immediate mode option for native shuffle
Add ImmediateModePartitioner that partitions incoming batches immediately using per-partition builders, flushing compressed IPC blocks when they reach target batch size. This reduces memory overhead compared to the buffered approach that stores all uncompressed rows before writing. Includes documentation and config option (spark.comet.exec.shuffle.partitionerMode). Default is buffered.
1 parent 6260665 commit 9750b26

12 files changed

Lines changed: 1247 additions & 49 deletions


common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 12 additions & 0 deletions
@@ -523,6 +523,18 @@ object CometConf extends ShimCometConf {
         "Should not be larger than batch size `spark.comet.batchSize`")
       .createWithDefault(8192)
 
+  val COMET_SHUFFLE_PARTITIONER_MODE: ConfigEntry[String] =
+    conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.partitionerMode")
+      .category(CATEGORY_SHUFFLE)
+      .doc(
+        "The partitioner mode used by the native shuffle writer. " +
+          "'immediate' writes partitioned IPC blocks immediately as batches arrive, " +
+          "reducing memory usage. 'buffered' buffers all rows before writing, which may " +
+          "improve performance for small datasets but uses more memory.")
+      .stringConf
+      .checkValues(Set("immediate", "buffered"))
+      .createWithDefault("buffered")
+
   val COMET_SHUFFLE_WRITE_BUFFER_SIZE: ConfigEntry[Long] =
     conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.writeBufferSize")
       .category(CATEGORY_SHUFFLE)

docs/source/contributor-guide/native_shuffle.md

Lines changed: 70 additions & 28 deletions
@@ -81,10 +81,18 @@ Native shuffle (`CometExchange`) is selected when all of the following condition
 └─────────────────────────────────────────────────────────────────────────────┘
                     │                                         │
                     ▼                                         ▼
-┌────────────────────────────────────┐  ┌───────────────────────────────────┐
-│ MultiPartitionShuffleRepartitioner │  │ SinglePartitionShufflePartitioner │
-│ (hash/range partitioning)          │  │ (single partition case)           │
-└────────────────────────────────────┘  └───────────────────────────────────┘
+┌───────────────────────────────────────────────────────────────────────┐
+│                         Partitioner Selection                         │
+│        Controlled by spark.comet.exec.shuffle.partitionerMode         │
+├───────────────────────────┬───────────────────────────────────────────┤
+│ immediate                 │ buffered (default)                        │
+│ ImmediateModePartitioner  │ MultiPartitionShuffleRepartitioner        │
+│ (hash/range/round-robin)  │ (hash/range/round-robin)                  │
+│ Writes IPC blocks as      │ Buffers all rows in memory                │
+│ batches arrive            │ before writing                            │
+├───────────────────────────┴───────────────────────────────────────────┤
+│       SinglePartitionShufflePartitioner (single partition case)       │
+└───────────────────────────────────────────────────────────────────────┘
 
 
 ┌───────────────────────────────────┐
@@ -113,11 +121,13 @@ Native shuffle (`CometExchange`) is selected when all of the following condition
 
 ### Rust Side
 
-| File                    | Location                             | Description                                                                          |
-| ----------------------- | ------------------------------------ | ------------------------------------------------------------------------------------ |
-| `shuffle_writer.rs`     | `native/core/src/execution/shuffle/` | `ShuffleWriterExec` plan and partitioners. Main shuffle logic.                       |
-| `codec.rs`              | `native/core/src/execution/shuffle/` | `ShuffleBlockWriter` for Arrow IPC encoding with compression. Also handles decoding. |
-| `comet_partitioning.rs` | `native/core/src/execution/shuffle/` | `CometPartitioning` enum defining partition schemes (Hash, Range, Single).           |
+| File                    | Location                           | Description                                                                                                                           |
+| ----------------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| `shuffle_writer.rs`     | `native/shuffle/src/`              | `ShuffleWriterExec` plan. Selects partitioner based on `immediate_mode` flag.                                                         |
+| `immediate_mode.rs`     | `native/shuffle/src/partitioners/` | `ImmediateModePartitioner`. Scatter-writes rows into per-partition Arrow builders and flushes IPC blocks to in-memory buffers eagerly. |
+| `multi_partition.rs`    | `native/shuffle/src/partitioners/` | `MultiPartitionShuffleRepartitioner`. Buffers all rows in memory, then writes partitions.                                             |
+| `codec.rs`              | `native/shuffle/src/`              | `ShuffleBlockWriter` for Arrow IPC encoding with compression. Also handles decoding.                                                  |
+| `comet_partitioning.rs` | `native/shuffle/src/`              | `CometPartitioning` enum defining partition schemes (Hash, Range, Single).                                                            |
 
 ## Data Flow
 

@@ -129,23 +139,33 @@ Native shuffle (`CometExchange`) is selected when all of the following condition
 
 2. **Native execution**: `CometExec.getCometIterator()` executes the plan in Rust.
 
-3. **Partitioning**: `ShuffleWriterExec` receives batches and routes to the appropriate partitioner:
-   - `MultiPartitionShuffleRepartitioner`: For hash/range/round-robin partitioning
-   - `SinglePartitionShufflePartitioner`: For single partition (simpler path)
+3. **Partitioning**: `ShuffleWriterExec` receives batches and routes to the appropriate partitioner
+   based on the `partitionerMode` configuration:
+   - **Immediate mode** (`ImmediateModePartitioner`): For hash/range/round-robin partitioning.
+     As each batch arrives, rows are scattered into per-partition Arrow array builders. When a
+     partition's builder reaches the target batch size, it is flushed as a compressed Arrow IPC
+     block to an in-memory buffer. Under memory pressure, these buffers are spilled to
+     per-partition temporary files. This keeps memory usage much lower than buffered mode since
+     data is encoded into compact IPC format eagerly rather than held as raw Arrow arrays.
 
-4. **Buffering and spilling**: The partitioner buffers rows per partition. When memory pressure
-   exceeds the threshold, partitions spill to temporary files.
+   - **Buffered mode** (`MultiPartitionShuffleRepartitioner`): For hash/range/round-robin
+     partitioning. Buffers all input `RecordBatch`es in memory, then partitions and writes
+     them in a single pass. When memory pressure exceeds the threshold, partitions spill to
+     temporary files.
 
-5. **Encoding**: `ShuffleBlockWriter` encodes each partition's data as compressed Arrow IPC:
+   - `SinglePartitionShufflePartitioner`: For single partition (simpler path, used regardless
+     of partitioner mode).
+
+4. **Encoding**: `ShuffleBlockWriter` encodes each partition's data as compressed Arrow IPC:
    - Writes compression type header
    - Writes field count header
    - Writes compressed IPC stream
 
-6. **Output files**: Two files are produced:
+5. **Output files**: Two files are produced:
    - **Data file**: Concatenated partition data
    - **Index file**: Array of 8-byte little-endian offsets marking partition boundaries
 
-7. **Commit**: Back in JVM, `CometNativeShuffleWriter` reads the index file to get partition
+6. **Commit**: Back in JVM, `CometNativeShuffleWriter` reads the index file to get partition
    lengths and commits via Spark's `IndexShuffleBlockResolver`.
 
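Step 3's immediate-mode flow can be sketched in std-only Rust. This is an illustrative model, not the actual implementation: plain `Vec<u64>` rows stand in for Arrow batches, `DefaultHasher` stands in for Comet's hash partitioning, and "flushing" simply moves a full buffer aside where the real `ImmediateModePartitioner` would encode a compressed IPC block:

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Toy immediate-mode partitioner: rows are scattered into per-partition
/// buffers and flushed as soon as a buffer reaches the target batch size.
struct ImmediatePartitioner {
    target_batch_size: usize,
    builders: Vec<Vec<u64>>,     // one "builder" per partition
    flushed: Vec<Vec<Vec<u64>>>, // flushed "IPC blocks" per partition
}

impl ImmediatePartitioner {
    fn new(num_partitions: usize, target_batch_size: usize) -> Self {
        Self {
            target_batch_size,
            builders: vec![Vec::new(); num_partitions],
            flushed: vec![Vec::new(); num_partitions],
        }
    }

    fn partition_id(&self, key: u64) -> usize {
        let mut h = DefaultHasher::new();
        key.hash(&mut h);
        (h.finish() as usize) % self.builders.len()
    }

    /// Scatter one incoming batch of rows, flushing any builder that fills up.
    fn insert_batch(&mut self, rows: &[u64]) {
        for &row in rows {
            let p = self.partition_id(row);
            self.builders[p].push(row);
            if self.builders[p].len() >= self.target_batch_size {
                let full = std::mem::take(&mut self.builders[p]);
                self.flushed[p].push(full); // real code: encode + compress IPC here
            }
        }
    }

    /// Flush any remaining partial builders at end of input.
    fn finish(&mut self) {
        for p in 0..self.builders.len() {
            if !self.builders[p].is_empty() {
                let rest = std::mem::take(&mut self.builders[p]);
                self.flushed[p].push(rest);
            }
        }
    }
}

fn main() {
    let mut part = ImmediatePartitioner::new(4, 8);
    part.insert_batch(&(0..100).collect::<Vec<u64>>());
    part.finish();
    let total: usize = part.flushed.iter().flatten().map(|b| b.len()).sum();
    assert_eq!(total, 100); // every row lands in exactly one partition
    println!("ok");
}
```

The key property the sketch shows: active memory is bounded by the partial builders, because full batches leave the builders as soon as they reach the target size.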
### Read Path
@@ -201,10 +221,31 @@ sizes.
 
 ## Memory Management
 
-Native shuffle uses DataFusion's memory management with spilling support:
+Native shuffle uses DataFusion's memory management. The memory characteristics differ
+between the two partitioner modes:
+
+### Immediate Mode
+
+Immediate mode keeps memory usage low by partitioning and encoding data eagerly as it arrives,
+rather than buffering all input rows before writing:
+
+- **Per-partition builders**: Each partition has a set of Arrow array builders sized to the
+  target batch size. When a builder fills up, it is flushed as a compressed IPC block to an
+  in-memory buffer.
+- **Memory footprint**: Proportional to `num_partitions × batch_size` for the builders, plus
+  the accumulated IPC buffers. This is typically much smaller than buffered mode since IPC
+  encoding is more compact than raw Arrow arrays.
+- **Spilling**: When memory pressure is detected via DataFusion's `MemoryConsumer` trait,
+  partition builders are flushed and all IPC buffers are drained to per-partition temporary
+  files on disk.
+
+### Buffered Mode
+
+Buffered mode holds all input data in memory before writing:
 
-- **Memory pool**: Tracks memory usage across the shuffle operation.
-- **Spill threshold**: When buffered data exceeds the threshold, partitions spill to disk.
+- **Buffered batches**: All incoming `RecordBatch`es are accumulated in a `Vec`.
+- **Spill threshold**: When buffered data exceeds the memory threshold, partitions spill to
+  temporary files on disk.
 - **Per-partition spilling**: Each partition has its own spill file. Multiple spills for a
   partition are concatenated when writing the final output.
 - **Scratch space**: Reusable buffers for partition ID computation to reduce allocations.
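The `num_partitions × batch_size` bound on builder memory is easy to make concrete. The numbers below are illustrative assumptions (16 bytes per row, the benchmark's 200 partitions, the default 8192-row batch size), not measured values:

```rust
/// Back-of-envelope active-builder footprint for immediate mode, assuming a
/// fixed average row width. Real footprints depend on the schema and on the
/// accumulated IPC buffers, which this estimate deliberately ignores.
fn builder_footprint_bytes(num_partitions: u64, batch_size: u64, bytes_per_row: u64) -> u64 {
    num_partitions * batch_size * bytes_per_row
}

fn main() {
    // 200 partitions x 8192 rows x 16 bytes/row = 25 MiB of active builders,
    // independent of total input size; buffered mode instead grows with input.
    let bytes = builder_footprint_bytes(200, 8192, 16);
    assert_eq!(bytes, 26_214_400);
    println!("{} MiB", bytes / (1024 * 1024));
}
```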
@@ -232,14 +273,15 @@ independently compressed, allowing parallel decompression during reads.
 
 ## Configuration
 
-| Config                                            | Default | Description                              |
-| ------------------------------------------------- | ------- | ---------------------------------------- |
-| `spark.comet.exec.shuffle.enabled`                | `true`  | Enable Comet shuffle                     |
-| `spark.comet.exec.shuffle.mode`                   | `auto`  | Shuffle mode: `native`, `jvm`, or `auto` |
-| `spark.comet.exec.shuffle.compression.codec`      | `zstd`  | Compression codec                        |
-| `spark.comet.exec.shuffle.compression.zstd.level` | `1`     | Zstd compression level                   |
-| `spark.comet.shuffle.write.buffer.size`           | `1MB`   | Write buffer size                        |
-| `spark.comet.columnar.shuffle.batch.size`         | `8192`  | Target rows per batch                    |
+| Config                                            | Default    | Description                                 |
+| ------------------------------------------------- | ---------- | ------------------------------------------- |
+| `spark.comet.exec.shuffle.enabled`                | `true`     | Enable Comet shuffle                        |
+| `spark.comet.exec.shuffle.mode`                   | `auto`     | Shuffle mode: `native`, `jvm`, or `auto`    |
+| `spark.comet.exec.shuffle.partitionerMode`        | `buffered` | Partitioner mode: `immediate` or `buffered` |
+| `spark.comet.exec.shuffle.compression.codec`      | `zstd`     | Compression codec                           |
+| `spark.comet.exec.shuffle.compression.zstd.level` | `1`        | Zstd compression level                      |
+| `spark.comet.shuffle.write.buffer.size`           | `1MB`      | Write buffer size                           |
+| `spark.comet.columnar.shuffle.batch.size`         | `8192`     | Target rows per batch                       |
 
 ## Comparison with JVM Shuffle
 
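The data flow above describes the index file as an array of 8-byte little-endian offsets marking partition boundaries. Assuming the usual n+1-offsets convention (an assumption for illustration, not something this diff confirms), partition lengths fall out as adjacent differences:

```rust
/// Decode partition lengths from a shuffle index file laid out as n+1
/// offsets stored as 8-byte little-endian integers; partition i spans
/// bytes [offsets[i], offsets[i+1]) of the data file.
fn partition_lengths(index_bytes: &[u8]) -> Vec<u64> {
    let offsets: Vec<u64> = index_bytes
        .chunks_exact(8)
        .map(|c| u64::from_le_bytes(c.try_into().unwrap()))
        .collect();
    offsets.windows(2).map(|w| w[1] - w[0]).collect()
}

fn main() {
    // Offsets 0, 100, 100, 250: three partitions, the middle one empty.
    let mut index = Vec::new();
    for off in [0u64, 100, 100, 250] {
        index.extend_from_slice(&off.to_le_bytes());
    }
    assert_eq!(partition_lengths(&index), vec![100, 0, 150]);
    println!("ok");
}
```

This is the shape of work `CometNativeShuffleWriter` does on the JVM side before handing partition lengths to `IndexShuffleBlockResolver`.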
docs/source/user-guide/latest/tuning.md

Lines changed: 11 additions & 0 deletions
@@ -144,6 +144,17 @@ Comet provides a fully native shuffle implementation, which generally provides t
 supports `HashPartitioning`, `RangePartitioning` and `SinglePartitioning` but currently only supports primitive type
 partitioning keys. Columns that are not partitioning keys may contain complex types like maps, structs, and arrays.
 
+Native shuffle has two partitioner modes, configured via
+`spark.comet.exec.shuffle.partitionerMode`:
+
+- **`immediate`**: Encodes partitioned Arrow IPC blocks immediately as each batch
+  arrives. This mode uses less memory because it does not need to buffer the entire input
+  before writing, and is recommended for large datasets.
+
+- **`buffered`** (default): Buffers all input rows in memory before partitioning and writing
+  to disk. This may improve performance for small datasets that fit in memory, but uses
+  significantly more memory.
+
 #### Columnar (JVM) Shuffle
 
 Comet Columnar shuffle is JVM-based and supports `HashPartitioning`, `RoundRobinPartitioning`, `RangePartitioning`, and

native/core/src/execution/planner.rs

Lines changed: 1 addition & 0 deletions
@@ -1379,6 +1379,7 @@ impl PhysicalPlanner {
             writer.output_index_file.clone(),
             writer.tracing_enabled,
             write_buffer_size,
+            writer.immediate_mode,
         )?);
 
         Ok((
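The new `writer.immediate_mode` argument is a plain boolean threaded from the protobuf plan into `ShuffleWriterExec`. A hypothetical sketch of what flag-based partitioner selection can look like (the type names mirror the docs above, but none of this is the actual Comet code):

```rust
/// Minimal dispatch sketch: a boolean flag picks one of two partitioner
/// implementations behind a common trait.
trait Partitioner {
    fn name(&self) -> &'static str;
}

struct ImmediateModePartitioner;
struct MultiPartitionShuffleRepartitioner;

impl Partitioner for ImmediateModePartitioner {
    fn name(&self) -> &'static str {
        "immediate"
    }
}

impl Partitioner for MultiPartitionShuffleRepartitioner {
    fn name(&self) -> &'static str {
        "buffered"
    }
}

fn make_partitioner(immediate_mode: bool) -> Box<dyn Partitioner> {
    if immediate_mode {
        Box::new(ImmediateModePartitioner)
    } else {
        Box::new(MultiPartitionShuffleRepartitioner)
    }
}

fn main() {
    assert_eq!(make_partitioner(true).name(), "immediate");
    assert_eq!(make_partitioner(false).name(), "buffered");
    println!("ok");
}
```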

native/proto/src/proto/operator.proto

Lines changed: 4 additions & 0 deletions
@@ -294,6 +294,10 @@ message ShuffleWriter {
   // Size of the write buffer in bytes used when writing shuffle data to disk.
   // Larger values may improve write performance but use more memory.
   int32 write_buffer_size = 8;
+  // Whether to use immediate mode partitioner. When true, partitioned IPC blocks
+  // are written immediately as batches arrive. When false, rows are buffered
+  // before writing (the original behavior).
+  bool immediate_mode = 9;
 }
 
 message ParquetWriter {

native/shuffle/README.md

Lines changed: 17 additions & 17 deletions
@@ -35,32 +35,32 @@ performance outside of Spark. It streams input data directly from Parquet files.
 cargo run --release --features shuffle-bench --bin shuffle_bench -- \
   --input /data/tpch-sf100/lineitem/ \
   --partitions 200 \
-  --codec lz4 \
+  --codec zstd --zstd-level 1 \
   --hash-columns 0,3
 ```
 
 ### Options
 
-| Option                | Default                    | Description                                            |
-| --------------------- | -------------------------- | ------------------------------------------------------ |
-| `--input`             | _(required)_               | Path to a Parquet file or directory of Parquet files   |
-| `--partitions`        | `200`                      | Number of output shuffle partitions                    |
-| `--partitioning`      | `hash`                     | Partitioning scheme: `hash`, `single`, `round-robin`   |
-| `--hash-columns`      | `0`                        | Comma-separated column indices to hash on (e.g. `0,3`) |
-| `--codec`             | `lz4`                      | Compression codec: `none`, `lz4`, `zstd`, `snappy`     |
-| `--zstd-level`        | `1`                        | Zstd compression level (1–22)                          |
-| `--batch-size`        | `8192`                     | Batch size for reading Parquet data                    |
-| `--memory-limit`      | _(none)_                   | Memory limit in bytes; triggers spilling when exceeded |
-| `--write-buffer-size` | `1048576`                  | Write buffer size in bytes                             |
-| `--limit`             | `0`                        | Limit rows processed per iteration (0 = no limit)      |
-| `--iterations`        | `1`                        | Number of timed iterations                             |
-| `--warmup`            | `0`                        | Number of warmup iterations before timing              |
-| `--output-dir`        | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files           |
+| Option                   | Default                    | Description                                                  |
+| ------------------------ | -------------------------- | ------------------------------------------------------------ |
+| `--input`                | _(required)_               | Path to a Parquet file or directory of Parquet files         |
+| `--partitions`           | `200`                      | Number of output shuffle partitions                          |
+| `--partitioning`         | `hash`                     | Partitioning scheme: `hash`, `single`, `round-robin`         |
+| `--hash-columns`         | `0`                        | Comma-separated column indices to hash on (e.g. `0,3`)       |
+| `--codec`                | `zstd`                     | Compression codec: `none`, `lz4`, `zstd`, `snappy`           |
+| `--zstd-level`           | `1`                        | Zstd compression level (1–22)                                |
+| `--batch-size`           | `8192`                     | Batch size for reading Parquet data                          |
+| `--memory-limit`         | _(none)_                   | Memory limit in bytes; triggers spilling when exceeded       |
+| `--write-buffer-size`    | `1048576`                  | Write buffer size in bytes                                   |
+| `--limit`                | `0`                        | Limit rows processed per iteration (0 = no limit)            |
+| `--iterations`           | `1`                        | Number of timed iterations                                   |
+| `--warmup`               | `0`                        | Number of warmup iterations before timing                    |
+| `--output-dir`           | `/tmp/comet_shuffle_bench` | Directory for temporary shuffle output files                 |
 
 ### Profiling with flamegraph
 
 ```sh
 cargo flamegraph --release --features shuffle-bench --bin shuffle_bench -- \
   --input /data/tpch-sf100/lineitem/ \
-  --partitions 200 --codec lz4
+  --partitions 200 --codec zstd --zstd-level 1
 ```

native/shuffle/benches/shuffle_writer.rs

Lines changed: 1 addition & 0 deletions
@@ -153,6 +153,7 @@ fn create_shuffle_writer_exec(
         "/tmp/index.out".to_string(),
         false,
         1024 * 1024,
+        false, // immediate_mode
     )
     .unwrap()
}
