fix: bump fast-path default, fix size inflation, fix new clippy lints

andygrove · andygrove · commit b62d68d4b76c · 2026-05-02T11:14:14.000-06:00
- Replace per-batch get_array_memory_size sum with a row-count x schema-
  derived estimate for the fast-path threshold decision. The Arrow helper
  reports the full underlying buffer for every zero-copy slice, so a 49 MB
  build can look like 97 MB after a shuffle read and spuriously miss the
  threshold. The row-count estimate has no such cross-slice double-counting.
  Added unit tests showing the slice-inflation behavior.
- Bump spark.comet.exec.graceHashJoin.fastPathThreshold default from
  64 MB to 512 MB executor-wide, since the prior default yielded only
  8 MB per task on a typical 8-core executor (4 MB in local[16]) which
  was smaller than most real build sides.
- Fix clippy 1.95 errors (collapse if-in-match-arm to match guards in
  join_with_spilled_probe; drop redundant .into_iter() in zip call).
diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -317,7 +317,7 @@ object CometConf extends ShimCometConf {
           "Larger values risk OOM because HashJoinExec creates non-spillable hash tables.")
       .longConf
       .checkValue(v => v >= 0, "The fast path threshold must be non-negative.")
-      .createWithDefault(64L * 1024 * 1024) // 64 MB
+      .createWithDefault(512L * 1024 * 1024) // 512 MB
 
   val COMET_EXEC_GRACE_HASH_JOIN_MAX_CONCURRENT_PARTITIONS: ConfigEntry[Int] =
     conf(s"$COMET_EXEC_CONFIG_PREFIX.graceHashJoin.maxConcurrentPartitions")
diff --git a/docs/source/contributor-guide/grace-hash-join-design.md b/docs/source/contributor-guide/grace-hash-join-design.md
@@ -34,7 +34,7 @@ Supports all join types: Inner, Left, Right, Full, LeftSemi, LeftAnti, LeftMark,
 | `spark.comet.exec.replaceSortMergeJoin`                  | boolean | `false`     | Replace SortMergeJoin with ShuffledHashJoin (enables GHJ)  |
 | `spark.comet.exec.replaceSortMergeJoin.maxBuildSize`     | long    | `104857600` | Max build-side bytes for SMJ replacement. `-1` = no limit  |
 | `spark.comet.exec.graceHashJoin.numPartitions`           | int     | `16`        | Number of hash partitions (buckets)                        |
-| `spark.comet.exec.graceHashJoin.fastPathThreshold`       | long    | `67108864`  | Executor-wide fast-path budget in bytes (divided by cores) |
+| `spark.comet.exec.graceHashJoin.fastPathThreshold`       | long    | `536870912` | Executor-wide fast-path budget in bytes (divided by cores) |
 | `spark.comet.exec.graceHashJoin.maxConcurrentPartitions` | int     | `2`         | Max partitions joined in parallel during Phase 3           |
 
 ### SMJ Replacement Guard
diff --git a/native/core/src/execution/operators/grace_hash_join/exec.rs b/native/core/src/execution/operators/grace_hash_join/exec.rs
@@ -740,27 +740,9 @@ fn join_with_spilled_probe(
     // Skip if build side is empty and join type requires it
     let build_empty = build_batches.is_empty();
     let skip = match join_type {
-        JoinType::Inner | JoinType::LeftSemi | JoinType::LeftAnti => {
-            if build_left {
-                build_empty
-            } else {
-                false // probe emptiness unknown without reading
-            }
-        }
-        JoinType::Left | JoinType::LeftMark => {
-            if build_left {
-                build_empty
-            } else {
-                false
-            }
-        }
-        JoinType::Right => {
-            if !build_left {
-                build_empty
-            } else {
-                false
-            }
-        }
+        JoinType::Inner | JoinType::LeftSemi | JoinType::LeftAnti if build_left => build_empty,
+        JoinType::Left | JoinType::LeftMark if build_left => build_empty,
+        JoinType::Right if !build_left => build_empty,
         _ => false,
     };
     if skip {
@@ -1077,7 +1059,7 @@ fn repartition_and_join(
     )?;
 
     // Recursively join each sub-partition
-    for (build_part, probe_part) in build_sub.into_iter().zip(probe_sub.into_iter()) {
+    for (build_part, probe_part) in build_sub.into_iter().zip(probe_sub) {
         join_partition_recursive(
             build_part,
             probe_part,
diff --git a/native/core/src/execution/operators/grace_hash_join/partition.rs b/native/core/src/execution/operators/grace_hash_join/partition.rs
@@ -27,7 +27,7 @@ use std::sync::Arc;
 use ahash::RandomState;
 use arrow::array::UInt32Array;
 use arrow::compute::take;
-use arrow::datatypes::SchemaRef;
+use arrow::datatypes::{DataType, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion::common::hash_utils::create_hashes;
 use datafusion::common::Result as DFResult;
@@ -242,8 +242,8 @@ pub(super) async fn buffer_build_optimistic(
     reservation: &mut MemoryReservation,
     metrics: &GraceHashJoinMetrics,
 ) -> DFResult<BuildBufferResult> {
+    let schema = input.schema();
     let mut batches = Vec::new();
-    let mut total_bytes = 0usize;
 
     while let Some(batch) = input.next().await {
         let batch = batch?;
@@ -254,6 +254,10 @@ pub(super) async fn buffer_build_optimistic(
         metrics.build_input_batches.add(1);
         metrics.build_input_rows.add(batch.num_rows());
 
+        // Per-batch `get_array_memory_size` is safe to use for `try_grow`
+        // because overestimating just makes us more conservative with memory
+        // pressure — it can only force us into the fallback path, never into
+        // a spurious OOM.
         let batch_size = batch.get_array_memory_size();
 
         if reservation.try_grow(batch_size).is_err() {
@@ -263,11 +267,94 @@ pub(super) async fn buffer_build_optimistic(
             return Ok(BuildBufferResult::NeedPartition(batches, input));
         }
 
-        total_bytes += batch_size;
         batches.push(batch);
     }
 
-    Ok(BuildBufferResult::Complete(batches, total_bytes))
+    // Compute a size estimate for the fast-path threshold check from schema +
+    // row count instead of `get_array_memory_size`. The latter reports the
+    // full underlying buffer for every zero-copy slice (common after shuffle),
+    // so a 49 MB build can look like 97 MB and spuriously fail the threshold.
+    let actual_bytes = approximate_memory_size(&batches, &schema);
+    Ok(BuildBufferResult::Complete(batches, actual_bytes))
+}
+
+/// Estimates the in-memory footprint of `batches` as
+/// `total row count × estimated bytes per row`, with the per-row width
+/// derived from `schema`.
+///
+/// The fast-path threshold decision uses this instead of summing
+/// `batch.get_array_memory_size()`, because that helper reports the full
+/// underlying buffer for every zero-copy slice and so over-counts batches
+/// coming out of a shuffle read. Row counts have no such double-counting.
+/// Variable-width columns (strings, binary) are charged a flat 32 bytes per
+/// row, so the result can land well above or below the real size — adequate
+/// only for gating the coarse threshold check.
+fn approximate_memory_size(batches: &[RecordBatch], schema: &Schema) -> usize {
+    let total_rows = batches.iter().map(RecordBatch::num_rows).sum::<usize>();
+    total_rows * approximate_row_size(schema)
+}
+
+/// Estimated bytes per row: the sum of every top-level column's estimate.
+fn approximate_row_size(schema: &Schema) -> usize {
+    let columns = schema.fields().iter();
+    columns
+        .map(|field| approximate_type_size(field.data_type()))
+        .sum()
+}
+
+/// Estimated bytes one value of `dt` adds to a row: the value width for
+/// fixed-width types, a coarse per-row guess for variable-width and nested ones.
+fn approximate_type_size(dt: &DataType) -> usize {
+    match dt {
+        DataType::Null => 0,
+        DataType::Boolean | DataType::Int8 | DataType::UInt8 => 1,
+        DataType::Int16 | DataType::UInt16 | DataType::Float16 => 2,
+        DataType::Int32
+        | DataType::UInt32
+        | DataType::Float32
+        | DataType::Date32
+        | DataType::Time32(_) => 4,
+        DataType::Int64
+        | DataType::UInt64
+        | DataType::Float64
+        | DataType::Date64
+        | DataType::Time64(_)
+        | DataType::Timestamp(_, _)
+        | DataType::Duration(_) => 8,
+        // Interval width depends on the unit, so ask Arrow; 16 is the widest unit.
+        DataType::Interval(_) => dt.primitive_width().unwrap_or(16),
+        DataType::Decimal32(_, _) => 4,
+        DataType::Decimal64(_, _) => 8,
+        DataType::Decimal128(_, _) => 16,
+        DataType::Decimal256(_, _) => 32,
+        DataType::FixedSizeBinary(n) => *n as usize,
+        // Variable-width: assume 32 bytes per value. An exact figure would need a
+        // scan of the offset buffers, which is overkill for a heuristic gate.
+        DataType::Binary
+        | DataType::LargeBinary
+        | DataType::BinaryView
+        | DataType::Utf8
+        | DataType::LargeUtf8
+        | DataType::Utf8View => 32,
+        DataType::List(f)
+        | DataType::LargeList(f)
+        | DataType::ListView(f)
+        | DataType::LargeListView(f) => 4 + approximate_type_size(f.data_type()),
+        // A fixed-size list stores exactly `n` child values in every row.
+        DataType::FixedSizeList(f, n) => (*n as usize) * approximate_type_size(f.data_type()),
+        DataType::Struct(fields) => fields
+            .iter()
+            .map(|f| approximate_type_size(f.data_type()))
+            .sum(),
+        DataType::Map(f, _) => 8 + approximate_type_size(f.data_type()),
+        DataType::Dictionary(k, v) => approximate_type_size(k) + approximate_type_size(v),
+        DataType::Union(fields, _) => fields
+            .iter()
+            .map(|(_, f)| approximate_type_size(f.data_type()))
+            .max()
+            .unwrap_or(8),
+        DataType::RunEndEncoded(_, values) => approximate_type_size(values.data_type()),
+    }
 }
 
 /// Partition already-buffered build batches into the partition structure.
@@ -815,3 +902,70 @@ pub(super) fn sub_partition_batches(
     }
     Ok(result)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{Int32Array, StringArray};
+    use arrow::datatypes::{DataType, Field, Schema};
+
+    /// `approximate_memory_size` must be insensitive to zero-copy slicing:
+    /// a batch sliced into N pieces should report the same total as the
+    /// unsliced parent. A naive sum of `get_array_memory_size` would
+    /// inflate the number by N because each slice reports the full buffer.
+    #[test]
+    fn approximate_memory_size_is_slice_invariant() {
+        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let values: Vec<i32> = (0..1000).collect();
+        let parent = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(values))],
+        )
+        .unwrap();
+
+        // 1000 rows * 4 bytes/row = 4000
+        let parent_est = approximate_memory_size(std::slice::from_ref(&parent), &schema);
+        assert_eq!(parent_est, 4000);
+
+        let slices = vec![
+            parent.slice(0, 250),
+            parent.slice(250, 250),
+            parent.slice(500, 250),
+            parent.slice(750, 250),
+        ];
+        let sliced_est = approximate_memory_size(&slices, &schema);
+        assert_eq!(sliced_est, parent_est);
+
+        // Contrast with the per-batch `get_array_memory_size` sum, the same
+        // helper the build buffer reserves memory with: every slice reports the
+        // parent's whole buffer, so the sum grows with the number of slices.
+        let naive: usize = slices.iter().map(|b| b.get_array_memory_size()).sum();
+        assert!(
+            naive > parent_est * 2,
+            "naive sum inflates with slices (got {naive}, parent {parent_est})"
+        );
+    }
+
+    /// Independent batches sharing a schema add up by row count.
+    #[test]
+    fn approximate_memory_size_sums_independent_batches() {
+        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let mk = |start: i32| {
+            let arr = Int32Array::from((start..start + 100).collect::<Vec<_>>());
+            RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(arr)]).unwrap()
+        };
+        let batches = vec![mk(0), mk(100), mk(200)];
+        // 3 * 100 rows * 4 bytes = 1200
+        assert_eq!(approximate_memory_size(&batches, &schema), 1200);
+    }
+
+    /// Utf8 columns are charged the flat per-row heuristic, not their real bytes.
+    #[test]
+    fn approximate_memory_size_handles_strings() {
+        let schema = Arc::new(Schema::new(vec![Field::new("s", DataType::Utf8, false)]));
+        let arr = StringArray::from(vec!["a"; 100]);
+        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(arr)]).unwrap();
+        // 100 rows * 32 bytes/row (heuristic) = 3200
+        assert_eq!(approximate_memory_size(&[batch], &schema), 3200);
+    }
+}
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -1715,7 +1715,7 @@ impl PhysicalPlanner {
                     let executor_cores = self.spark_conf.get_usize(SPARK_EXECUTOR_CORES, 1).max(1);
                     let total_fast_path_threshold = self
                         .spark_conf
-                        .get_usize(COMET_GRACE_HASH_JOIN_FAST_PATH_THRESHOLD, 64 * 1024 * 1024);
+                        .get_usize(COMET_GRACE_HASH_JOIN_FAST_PATH_THRESHOLD, 512 * 1024 * 1024);
                     let fast_path_threshold = total_fast_path_threshold / executor_cores;
                     let max_concurrent_partitions = self
                         .spark_conf