fix: remaining Grace Hash Join correctness issues

andygrove · andygrove · commit 4401493f5f85 · 2026-05-02T09:51:58.000-06:00
- Divide spark.comet.exec.graceHashJoin.fastPathThreshold by
  spark.executor.cores in the planner so the configured value is an
  executor-wide budget rather than a per-task one. Without the division,
  N concurrent tasks could each independently take the fast path and
  cumulatively exceed the intended budget. Update the CometConf doc to
  match the design-doc semantics.

- Mix recursion_level through the golden-ratio constant when deriving
  the per-level hash seed. Plain XOR only flipped a few low bits between
  adjacent levels, letting ahash produce correlated distributions and
  undermining the recursion depth limit for skewed data.

- Generalize SpillReaderExec to accept both in-memory batches and
  multiple spill files, reading them sequentially into a single
  coalesced stream. Remove the eager-read fallback in
  join_with_spilled_probe so merged partitions always honor the
  streaming-probe design invariant.
diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -309,11 +309,12 @@ object CometConf extends ShimCometConf {
     conf(s"$COMET_EXEC_CONFIG_PREFIX.graceHashJoin.fastPathThreshold")
       .category(CATEGORY_EXEC)
       .doc(
-        "Per-task memory budget in bytes for Grace Hash Join fast-path hash tables. " +
-          "When a build side fits in memory and is smaller than this threshold, " +
-          "the join executes as a single HashJoinExec without partitioning or spilling. " +
-          "Set to 0 to disable the fast path. Larger values risk OOM because HashJoinExec " +
-          "creates non-spillable hash tables.")
+        "Executor-wide memory budget in bytes for Grace Hash Join fast-path hash tables. " +
+          "The native planner divides this by spark.executor.cores so that each task's " +
+          "fast-path hash table stays within its fair share. When a task's build side fits " +
+          "in memory and is smaller than its share, the join executes as a single " +
+          "HashJoinExec without partitioning or spilling. Set to 0 to disable the fast path. " +
+          "Larger values risk OOM because HashJoinExec creates non-spillable hash tables.")
       .longConf
       .checkValue(v => v >= 0, "The fast path threshold must be non-negative.")
       .createWithDefault(64L * 1024 * 1024) // 64 MB
diff --git a/native/core/src/execution/operators/grace_hash_join/exec.rs b/native/core/src/execution/operators/grace_hash_join/exec.rs
@@ -823,28 +823,14 @@ fn join_with_spilled_probe(
     // Build side: StreamSourceExec to avoid BatchSplitStream splitting
     let build_source = memory_source_exec(build_data, build_schema)?;
 
-    // Probe side: streaming from spill file(s).
-    // With a single spill file and no in-memory batches, use the streaming
-    // SpillReaderExec. Otherwise read eagerly since the merged group sizes
-    // are bounded by TARGET_PARTITION_BUILD_SIZE.
-    let probe_source: Arc<dyn ExecutionPlan> =
-        if probe_spill_files.len() == 1 && probe_in_memory.is_empty() {
-            Arc::new(SpillReaderExec::new(
-                probe_spill_files.into_iter().next().unwrap(),
-                Arc::clone(probe_schema),
-            ))
-        } else {
-            let mut probe_batches = probe_in_memory;
-            for spill_file in &probe_spill_files {
-                probe_batches.extend(read_spilled_batches(spill_file)?);
-            }
-            let probe_data = if probe_batches.is_empty() {
-                vec![RecordBatch::new_empty(Arc::clone(probe_schema))]
-            } else {
-                vec![concat_batches(probe_schema, &probe_batches)?]
-            };
-            memory_source_exec(probe_data, probe_schema)?
-        };
+    // Probe side: streamed from spill files with any in-memory batches
+    // prepended. `SpillReaderExec` handles both sources uniformly so we
+    // never fall back to eagerly materializing a merged group's probe data.
+    let probe_source: Arc<dyn ExecutionPlan> = Arc::new(SpillReaderExec::new(
+        probe_in_memory,
+        probe_spill_files,
+        Arc::clone(probe_schema),
+    ));
 
     // HashJoinExec expects left=build in CollectLeft mode
     let (left_source, right_source) = if build_left {
@@ -855,14 +841,8 @@ fn join_with_spilled_probe(
 
     info!(
         "GraceHashJoin: SPILLED PROBE PATH creating HashJoinExec, \
-         build_left={}, build_size={}, probe_source={}",
-        build_left,
-        build_size,
-        if probe_spill_files_count == 1 {
-            "SpillReaderExec"
-        } else {
-            "StreamSourceExec"
-        },
+         build_left={}, build_size={}, probe_spill_files={}",
+        build_left, build_size, probe_spill_files_count,
     );
 
     let stream = execute_hash_join(
diff --git a/native/core/src/execution/operators/grace_hash_join/partition.rs b/native/core/src/execution/operators/grace_hash_join/partition.rs
@@ -45,12 +45,23 @@ use super::PROBE_PROGRESS_MILESTONE_ROWS;
 
 /// Random state for hashing join keys into partitions. Uses fixed seeds
 /// different from DataFusion's HashJoinExec to avoid correlation.
-/// The `recursion_level` is XORed into the seed so that recursive
-/// repartitioning uses different hash functions at each level.
+///
+/// Each recursion level must produce a well-separated distribution so that
+/// rows which collided at level `N` scatter across sub-partitions at level
+/// `N+1`. A naïve `seed ^ recursion_level` only flips a few low bits between
+/// adjacent levels — ahash's multiply-rotate mixer can produce correlated
+/// outputs from such similar seeds, which would undermine the recursion
+/// depth limit for skewed data.
+///
+/// We mix the level through the 64-bit golden-ratio constant (2^64 / phi,
+/// the SplitMix64 / Fibonacci-hashing increment) so successive levels
+/// produce seeds that differ in roughly half their bits.
 fn partition_random_state(recursion_level: usize) -> RandomState {
+    const PHI64: u64 = 0x9E3779B97F4A7C15;
+    let mix = (recursion_level as u64).wrapping_mul(PHI64);
     RandomState::with_seeds(
-        0x517cc1b727220a95 ^ (recursion_level as u64),
-        0x3a8b7c9d1e2f4056,
+        0x517cc1b727220a95 ^ mix,
+        0x3a8b7c9d1e2f4056 ^ mix.rotate_left(32),
         0,
         0,
     )
diff --git a/native/core/src/execution/operators/grace_hash_join/spill.rs b/native/core/src/execution/operators/grace_hash_join/spill.rs
@@ -111,26 +111,38 @@ impl SpillWriter {
 // SpillReaderExec: streaming ExecutionPlan for reading spill files
 // ---------------------------------------------------------------------------
 
-/// An ExecutionPlan that streams record batches from an Arrow IPC spill file.
-/// Used during the join phase so that spilled probe data is read on-demand
-/// instead of loaded entirely into memory.
+/// An `ExecutionPlan` that streams record batches from zero or more in-memory
+/// batches followed by zero or more Arrow IPC spill files. Used during the
+/// join phase so that spilled probe data is read on-demand instead of loaded
+/// entirely into memory.
+///
+/// All sources are concatenated into a single output stream with the same
+/// sub-batch coalescing applied uniformly.
 #[derive(Debug)]
 pub(super) struct SpillReaderExec {
-    spill_file: RefCountedTempFile,
+    /// Batches held in memory, emitted first.
+    initial_batches: Vec<RecordBatch>,
+    /// Spill files read sequentially after `initial_batches`.
+    spill_files: Vec<RefCountedTempFile>,
     schema: SchemaRef,
     cache: Arc<PlanProperties>,
 }
 
 impl SpillReaderExec {
-    pub(super) fn new(spill_file: RefCountedTempFile, schema: SchemaRef) -> Self {
+    pub(super) fn new(
+        initial_batches: Vec<RecordBatch>,
+        spill_files: Vec<RefCountedTempFile>,
+        schema: SchemaRef,
+    ) -> Self {
         let cache = Arc::new(PlanProperties::new(
             EquivalenceProperties::new(Arc::clone(&schema)),
             Partitioning::UnknownPartitioning(1),
             datafusion::physical_plan::execution_plan::EmissionType::Incremental,
             datafusion::physical_plan::execution_plan::Boundedness::Bounded,
         ));
         Self {
-            spill_file,
+            initial_batches,
+            spill_files,
             schema,
             cache,
         }
@@ -178,74 +190,100 @@ impl ExecutionPlan for SpillReaderExec {
     ) -> DFResult<SendableRecordBatchStream> {
         let stream_schema = Arc::clone(&self.schema);
         let coalesce_schema = Arc::clone(&self.schema);
-        let path = self.spill_file.path().to_path_buf();
-        // Move the spill file handle into the blocking closure to keep
-        // the temp file alive until the reader is done.
-        let spill_file_handle = self.spill_file.clone();
+        let initial_batches = self.initial_batches.clone();
+        // Clone the file handles so the blocking task owns references that
+        // keep the temp files alive until reading completes.
+        let spill_files: Vec<RefCountedTempFile> = self.spill_files.to_vec();
 
         // Use a channel so file I/O runs on a blocking thread and doesn't
         // block the async executor. This lets select_all interleave multiple
         // partition streams effectively.
         let (tx, rx) = mpsc::channel::<DFResult<RecordBatch>>(4);
 
         tokio::task::spawn_blocking(move || {
-            let _keep_alive = spill_file_handle;
-            let file = match File::open(&path) {
-                Ok(f) => f,
-                Err(e) => {
-                    let _ = tx.blocking_send(Err(DataFusionError::Execution(format!(
-                        "Failed to open spill file: {e}"
-                    ))));
-                    return;
-                }
-            };
-            let reader = match StreamReader::try_new(
-                BufReader::with_capacity(SPILL_IO_BUFFER_SIZE, file),
-                None,
-            ) {
-                Ok(r) => r,
-                Err(e) => {
-                    let _ = tx.blocking_send(Err(DataFusionError::ArrowError(Box::new(e), None)));
-                    return;
-                }
-            };
-
-            // Coalesce small sub-batches into larger ones to reduce per-batch
-            // overhead in the downstream hash join.
+            // Small sub-batches (~1000-row inputs split N ways produce ~1000/N
+            // row sub-batches) are coalesced into ~SPILL_READ_COALESCE_TARGET
+            // row batches to reduce per-batch overhead in the downstream join.
             let mut pending: Vec<RecordBatch> = Vec::new();
             let mut pending_rows = 0usize;
 
-            for batch_result in reader {
-                let batch = match batch_result {
-                    Ok(b) => b,
-                    Err(e) => {
-                        let _ =
-                            tx.blocking_send(Err(DataFusionError::ArrowError(Box::new(e), None)));
-                        return;
+            // A closure or free fn would complicate borrowing; a macro inlines the flush.
+            macro_rules! flush_if_ready {
+                () => {
+                    if pending_rows >= SPILL_READ_COALESCE_TARGET {
+                        let merged = if pending.len() == 1 {
+                            Ok(pending.pop().unwrap())
+                        } else {
+                            concat_batches(&coalesce_schema, &pending)
+                                .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
+                        };
+                        pending.clear();
+                        pending_rows = 0;
+                        if tx.blocking_send(merged).is_err() {
+                            return;
+                        }
                     }
                 };
+            }
+
+            // Emit any in-memory batches first, applying the same coalescing.
+            for batch in initial_batches {
                 if batch.num_rows() == 0 {
                     continue;
                 }
                 pending_rows += batch.num_rows();
                 pending.push(batch);
+                flush_if_ready!();
+            }
 
-                if pending_rows >= SPILL_READ_COALESCE_TARGET {
-                    let merged = if pending.len() == 1 {
-                        Ok(pending.pop().unwrap())
-                    } else {
-                        concat_batches(&coalesce_schema, &pending)
-                            .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
-                    };
-                    pending.clear();
-                    pending_rows = 0;
-                    if tx.blocking_send(merged).is_err() {
+            // Then read each spill file sequentially.
+            for spill_file in &spill_files {
+                let file = match File::open(spill_file.path()) {
+                    Ok(f) => f,
+                    Err(e) => {
+                        let _ = tx.blocking_send(Err(DataFusionError::Execution(format!(
+                            "Failed to open spill file: {e}"
+                        ))));
+                        return;
+                    }
+                };
+                let reader = match StreamReader::try_new(
+                    BufReader::with_capacity(SPILL_IO_BUFFER_SIZE, file),
+                    None,
+                ) {
+                    Ok(r) => r,
+                    Err(e) => {
+                        let _ = tx.blocking_send(Err(DataFusionError::ArrowError(
+                            Box::new(e),
+                            None,
+                        )));
                         return;
                     }
+                };
+
+                for batch_result in reader {
+                    let batch = match batch_result {
+                        Ok(b) => b,
+                        Err(e) => {
+                            let _ = tx.blocking_send(Err(DataFusionError::ArrowError(
+                                Box::new(e),
+                                None,
+                            )));
+                            return;
+                        }
+                    };
+                    if batch.num_rows() == 0 {
+                        continue;
+                    }
+                    pending_rows += batch.num_rows();
+                    pending.push(batch);
+                    flush_if_ready!();
                 }
             }
+            // Keep the temp files alive until the reader is done.
+            drop(spill_files);
 
-            // Flush remaining
+            // Flush any remaining buffered batches.
             if !pending.is_empty() {
                 let merged = if pending.len() == 1 {
                     Ok(pending.pop().unwrap())
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -1702,15 +1702,22 @@ impl PhysicalPlanner {
                     use crate::execution::spark_config::{
                         SparkConfig, COMET_GRACE_HASH_JOIN_FAST_PATH_THRESHOLD,
                         COMET_GRACE_HASH_JOIN_MAX_CONCURRENT_PARTITIONS,
-                        COMET_GRACE_HASH_JOIN_NUM_PARTITIONS,
+                        COMET_GRACE_HASH_JOIN_NUM_PARTITIONS, SPARK_EXECUTOR_CORES,
                     };
 
                     let num_partitions = self
                         .spark_conf
                         .get_usize(COMET_GRACE_HASH_JOIN_NUM_PARTITIONS, 16);
-                    let fast_path_threshold = self
+                    // The fast-path threshold is the *executor-wide* budget across all
+                    // concurrent tasks. Divide it by `spark.executor.cores` so each
+                    // task's fast-path hash table stays within its fair share and N
+                    // concurrent tasks don't collectively exceed the configured budget.
+                    let executor_cores =
+                        self.spark_conf.get_usize(SPARK_EXECUTOR_CORES, 1).max(1);
+                    let total_fast_path_threshold = self
                         .spark_conf
                         .get_usize(COMET_GRACE_HASH_JOIN_FAST_PATH_THRESHOLD, 64 * 1024 * 1024);
+                    let fast_path_threshold = total_fast_path_threshold / executor_cores;
                     let max_concurrent_partitions = self
                         .spark_conf
                         .get_usize(COMET_GRACE_HASH_JOIN_MAX_CONCURRENT_PARTITIONS, 2);