apache
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
Lines changed: 7 additions & 2 deletions
diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs
Lines changed: 131 additions & 0 deletions
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
Lines changed: 3 additions & 3 deletions
diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
Lines changed: 120 additions & 0 deletions
diff --git a/datafusion/ffi/src/physical_expr/partitioning.rs b/datafusion/ffi/src/physical_expr/partitioning.rs
Lines changed: 5 additions & 0 deletions
diff --git a/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs b/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
Lines changed: 3 additions & 1 deletion
diff --git a/datafusion/functions-aggregate/src/min_max/min_max_struct.rs b/datafusion/functions-aggregate/src/min_max/min_max_struct.rs
Lines changed: 3 additions & 1 deletion
@@ -1151,8 +1151,13 @@ config_namespace! {
         /// in parallel using the provided `target_partitions` level
         pub repartition_aggregations: bool, default = true
 
-        /// Minimum total files size in bytes to perform file scan repartitioning.
-        pub repartition_file_min_size: usize, default = 10 * 1024 * 1024
+        /// Minimum total file size in bytes for file-group byte-range
+        /// splitting to fire. Files (or merged file groups) smaller than this
+        /// stay as one partition. Lower values produce more, smaller
+        /// partitions — better at filling `target_partitions` worth of cores
+        /// when files are modestly sized, at the cost of slightly more
+        /// per-partition open / metadata-load overhead.
+        pub repartition_file_min_size: usize, default = 1024 * 1024
 
         /// Should DataFusion repartition data using the join keys to execute joins in parallel
         /// using the provided `target_partitions` level
 
@@ -395,6 +395,137 @@ pub fn longest_consecutive_prefix<T: Borrow<usize>>(
     count
 }
 
+/// Splits `vec` at index `n`, returning the first `n` elements and leaving the
+/// remaining `vec.len() - n` elements in `vec`.
+///
+/// Only the smaller side is moved into a fresh allocation, so the new
+/// allocation holds `min(n, vec.len() - n)` elements rather than always `n`
+/// (as `vec.drain(0..n).collect()` would). This matters when the split emits a
+/// prefix under memory pressure, where `n` can be close to `vec.len()`.
+///
+/// When the prefix is the larger side, it takes over the original backing
+/// allocation of `vec`, including any spare capacity it had.
+///
+/// # Panics
+///
+/// Panics if `n > vec.len()`.
+pub fn split_vec_min_alloc<T>(vec: &mut Vec<T>, n: usize) -> Vec<T> {
+    // `n <= len / 2` is equivalent to `n * 2 <= len` for integers, but it
+    // cannot overflow for a very large `n`. An out-of-range `n` therefore
+    // always reaches the bounds check of `split_off` below rather than an
+    // arithmetic-overflow panic (debug) or a wrapped comparison (release).
+    if n <= vec.len() / 2 {
+        // The prefix is the smaller (or equal) side: collect it into a new
+        // `Vec` and let `drain` shift the tail down in place.
+        vec.drain(..n).collect()
+    } else {
+        // The tail is the smaller side: move it into a new allocation and
+        // hand the original buffer to the returned prefix.
+        let remaining = vec.split_off(n);
+        std::mem::replace(vec, remaining)
+    }
+}
+
+#[cfg(test)]
+mod split_vec_min_alloc_tests {
+    //! Unit tests for [`split_vec_min_alloc`]: both allocation branches, the
+    //! boundary between them, out-of-range input, and the capacity hand-off
+    //! of the emitted prefix.
+
+    use super::split_vec_min_alloc;
+
+    #[test]
+    fn drain_branch() {
+        // n * 2 <= len  ->  drain+collect branch (allocates n elements)
+        let mut v = vec![1, 2, 3, 4, 5, 6];
+        let first = split_vec_min_alloc(&mut v, 2);
+        assert_eq!(first, vec![1, 2]);
+        assert_eq!(v, vec![3, 4, 5, 6]);
+    }
+
+    #[test]
+    fn split_off_branch() {
+        // remaining < n  ->  split_off+replace branch (allocates remaining elements)
+        let mut v = vec![1, 2, 3, 4, 5, 6];
+        let first = split_vec_min_alloc(&mut v, 4);
+        assert_eq!(first, vec![1, 2, 3, 4]);
+        assert_eq!(v, vec![5, 6]);
+    }
+
+    #[test]
+    fn exactly_half() {
+        // n * 2 == len  ->  drain branch (boundary)
+        let mut v = vec![1, 2, 3, 4];
+        let first = split_vec_min_alloc(&mut v, 2);
+        assert_eq!(first, vec![1, 2]);
+        assert_eq!(v, vec![3, 4]);
+    }
+
+    #[test]
+    fn odd_length_boundary() {
+        // len = 5: n = 2 is the last value that takes the drain branch and
+        // n = 3 the first that takes the split-off branch. The observable
+        // split must be the same on either side of the boundary.
+        let mut v = vec![1, 2, 3, 4, 5];
+        assert_eq!(split_vec_min_alloc(&mut v, 2), vec![1, 2]);
+        assert_eq!(v, vec![3, 4, 5]);
+
+        let mut v = vec![1, 2, 3, 4, 5];
+        assert_eq!(split_vec_min_alloc(&mut v, 3), vec![1, 2, 3]);
+        assert_eq!(v, vec![4, 5]);
+    }
+
+    #[test]
+    fn take_all() {
+        let mut v = vec![1, 2, 3];
+        let first = split_vec_min_alloc(&mut v, 3);
+        assert_eq!(first, vec![1, 2, 3]);
+        assert!(v.is_empty());
+    }
+
+    #[test]
+    fn take_none() {
+        let mut v = vec![1, 2, 3];
+        let first = split_vec_min_alloc(&mut v, 0);
+        assert!(first.is_empty());
+        assert_eq!(v, vec![1, 2, 3]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn n_past_end_panics() {
+        // Splitting past the end is a caller bug; it must panic rather than
+        // silently truncate the request.
+        let mut v = vec![1, 2, 3];
+        let _ = split_vec_min_alloc(&mut v, 4);
+    }
+
+    #[test]
+    fn emitted_prefix_does_not_realloc_on_push() {
+        // Demonstrates *why* the split-off branch must NOT call `shrink_to_fit`.
+        //
+        // Downstream callers (e.g. `multi_group_by/bytes.rs`, which does
+        // `first_n_offsets.push(offset_n)` right after the split) push onto the
+        // emitted prefix immediately. The split-off branch hands the original
+        // backing allocation to that prefix, so the prefix already has spare
+        // capacity for the very next push.
+        //
+        // If we shrank the prefix to fit, that next push would have to
+        // reallocate, and Vec's growth strategy would land it at a *larger*
+        // capacity than the shrunk one -- the opposite of the memory saving
+        // `shrink_to_fit` was meant to deliver.
+
+        // A Vec with a known, deliberately large capacity. n*2 > len, so this
+        // takes the split-off branch.
+        let mut v: Vec<u32> = Vec::with_capacity(64);
+        v.extend(0..10);
+        let original_capacity = v.capacity();
+        assert!(original_capacity >= 64);
+
+        // Emit a prefix that is most of the Vec (n = 8, remaining = 2).
+        let mut prefix = split_vec_min_alloc(&mut v, 8);
+        assert_eq!(prefix, vec![0, 1, 2, 3, 4, 5, 6, 7]);
+        assert_eq!(v, vec![8, 9]);
+
+        // The split-off branch moved the original backing store into `prefix`,
+        // so it keeps the original (large) capacity -- no shrink happened.
+        assert_eq!(
+            prefix.capacity(),
+            original_capacity,
+            "split-off branch must hand the original allocation to the prefix"
+        );
+
+        // The caller's very next operation: push one element onto the prefix.
+        prefix.push(99);
+
+        // Because the capacity was preserved, the push reused the existing
+        // allocation and the capacity is unchanged. This avoids the realloc
+        // that a `shrink_to_fit` on the prefix would have forced.
+        assert_eq!(
+            prefix.capacity(),
+            original_capacity,
+            "push must reuse the preserved allocation (no realloc)"
+        );
+
+        // Counter-demonstration: had we shrunk the prefix to fit (capacity 8),
+        // the same push would have reallocated. Vec grows geometrically, so
+        // the post-push capacity ends up LARGER than where a length-8 prefix
+        // started -- and we paid a realloc for it.
+        let mut shrunk: Vec<u32> = prefix[..8].to_vec();
+        shrunk.shrink_to_fit();
+        let grown_capacity = capacity_after_push(&mut shrunk);
+        assert!(
+            grown_capacity > 8,
+            "shrink-to-fit then push reallocates to a larger capacity"
+        );
+    }
+
+    /// Helper for the counter-demonstration above: pushes one element onto `v`
+    /// and reports the resulting capacity.
+    fn capacity_after_push(v: &mut Vec<u32>) -> usize {
+        v.push(99);
+        v.capacity()
+    }
+}
+
 /// Creates single element [`ListArray`], [`LargeListArray`] and
 /// [`FixedSizeListArray`] from other arrays
 ///
 
@@ -840,7 +840,7 @@ async fn test_aggregate_with_pk() -> Result<()> {
     let aggr_expr = vec![];
     let df = df.aggregate(group_expr, aggr_expr)?;
 
-    // Since id and name are functionally dependant, we can use name among
+    // Since id and name are functionally dependent, we can use name among
     // expression even if it is not part of the group by expression and can
     // select "name" column even though it wasn't explicitly grouped
     let df = df.select(vec![col("id"), col("name")])?;
@@ -895,7 +895,7 @@ async fn test_aggregate_with_pk2() -> Result<()> {
     "
     );
 
-    // Since id and name are functionally dependant, we can use name among expression
+    // Since id and name are functionally dependent, we can use name among expression
     // even if it is not part of the group by expression.
     let df_results = df.collect().await?;
 
@@ -943,7 +943,7 @@ async fn test_aggregate_with_pk3() -> Result<()> {
     "
     );
 
-    // Since id and name are functionally dependant, we can use name among expression
+    // Since id and name are functionally dependent, we can use name among expression
     // even if it is not part of the group by expression.
     let df_results = df.collect().await?;
 
 
@@ -714,3 +714,123 @@ fn no_limit_preserves_plan_identity() -> Result<()> {
 
     Ok(())
 }
+
+#[test]
+fn outer_offset_does_not_leak_through_sort_into_inner_limit() -> Result<()> {
+    // Regression coverage for https://github.com/apache/datafusion/issues/22489
+    //
+    // An outer OFFSET sitting above a SortExec must not be folded into an
+    // inner LIMIT that lives below that sort when the two sorts use different
+    // keys: the outer skip applies to rows in the *outer* order, so it cannot
+    // shrink the inner fetch. Before the fix, combine_limit merged the two and
+    // produced GlobalLimitExec(skip=1, fetch=7) rather than keeping LIMIT 8.
+    //
+    // Input plan:
+    // GlobalLimitExec: skip=1, fetch=None        (outer OFFSET 1)
+    //   SortExec: [c1 DESC]                      (outer sort — different key)
+    //     GlobalLimitExec: skip=0, fetch=8        (inner LIMIT 8)
+    //       SortExec: [c2 ASC]                    (inner sort — different key)
+    //         EmptyExec
+    let schema = create_schema();
+    let source = empty_exec(Arc::clone(&schema));
+
+    // Inner pipeline: sort by c2 with default options, then LIMIT 8.
+    let by_c2 = PhysicalSortExpr {
+        expr: col("c2", &schema)?,
+        options: SortOptions::default(),
+    };
+    let c2_ordering: LexOrdering = [by_c2].into();
+    let limited = global_limit_exec(sort_exec(c2_ordering, source), 0, Some(8));
+
+    // Outer pipeline: re-sort by c1 descending (nulls last), then OFFSET 1.
+    let by_c1_desc = PhysicalSortExpr {
+        expr: col("c1", &schema)?,
+        options: SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    };
+    let c1_ordering: LexOrdering = [by_c1_desc].into();
+    let plan = global_limit_exec(sort_exec(c1_ordering, limited), 1, None);
+
+    let before = format_plan(&plan);
+    insta::assert_snapshot!(
+        before,
+        @r"
+    GlobalLimitExec: skip=1, fetch=None
+      SortExec: expr=[c1@0 DESC NULLS LAST], preserve_partitioning=[false]
+        GlobalLimitExec: skip=0, fetch=8
+          SortExec: expr=[c2@1 ASC], preserve_partitioning=[false]
+            EmptyExec
+    "
+    );
+
+    // The inner LIMIT 8 must survive intact (as a TopK on the inner sort) and
+    // the outer skip must stay above the outer sort.
+    let rewritten = LimitPushdown::new().optimize(plan, &ConfigOptions::new())?;
+    let after = format_plan(&rewritten);
+    insta::assert_snapshot!(
+        after,
+        @r"
+    GlobalLimitExec: skip=1, fetch=None
+      SortExec: expr=[c1@0 DESC NULLS LAST], preserve_partitioning=[false]
+        SortExec: TopK(fetch=8), expr=[c2@1 ASC], preserve_partitioning=[false]
+          EmptyExec
+    "
+    );
+
+    Ok(())
+}
+
+#[test]
+fn outer_offset_with_same_sort_key_still_pushes_limit() -> Result<()> {
+    // Counterpart of outer_offset_does_not_leak_through_sort_into_inner_limit
+    // for the case where both sorts share the *same* key: the inner LIMIT is
+    // still expected to be absorbed into the inner SortExec as a TopK.
+    //
+    // Input plan:
+    // GlobalLimitExec: skip=1, fetch=None        (outer OFFSET 1)
+    //   SortExec: [c1 ASC]                       (outer sort — same key)
+    //     GlobalLimitExec: skip=0, fetch=8        (inner LIMIT 8)
+    //       SortExec: [c1 ASC]                    (inner sort — same key)
+    //         EmptyExec
+    let schema = create_schema();
+    let source = empty_exec(Arc::clone(&schema));
+
+    // One ordering (c1, default options) shared by the inner and outer sort.
+    let by_c1 = PhysicalSortExpr {
+        expr: col("c1", &schema)?,
+        options: SortOptions::default(),
+    };
+    let c1_ordering: LexOrdering = [by_c1].into();
+
+    let limited = global_limit_exec(sort_exec(c1_ordering.clone(), source), 0, Some(8));
+    let plan = global_limit_exec(sort_exec(c1_ordering, limited), 1, None);
+
+    let before = format_plan(&plan);
+    insta::assert_snapshot!(
+        before,
+        @r"
+    GlobalLimitExec: skip=1, fetch=None
+      SortExec: expr=[c1@0 ASC], preserve_partitioning=[false]
+        GlobalLimitExec: skip=0, fetch=8
+          SortExec: expr=[c1@0 ASC], preserve_partitioning=[false]
+            EmptyExec
+    "
+    );
+
+    let rewritten = LimitPushdown::new().optimize(plan, &ConfigOptions::new())?;
+    let after = format_plan(&rewritten);
+    insta::assert_snapshot!(
+        after,
+        @r"
+    GlobalLimitExec: skip=1, fetch=None
+      SortExec: expr=[c1@0 ASC], preserve_partitioning=[false]
+        SortExec: TopK(fetch=8), expr=[c1@0 ASC], preserve_partitioning=[false]
+          EmptyExec
+    "
+    );
+
+    Ok(())
+}
@@ -45,6 +45,11 @@ impl From<&Partitioning> for FFI_Partitioning {
                     .collect();
                 Self::Hash(exprs, *size)
             }
+            // FFI does not yet expose range partition metadata.
+            // See https://github.com/apache/datafusion/issues/22394
+            Partitioning::Range(range) => {
+                Self::UnknownPartitioning(range.partition_count())
+            }
             Partitioning::UnknownPartitioning(size) => Self::UnknownPartitioning(*size),
         }
     }
 
@@ -27,6 +27,8 @@ use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls:
 use std::mem::size_of;
 use std::sync::Arc;
 
+use datafusion_common::utils::split_vec_min_alloc;
+
 /// Implements fast Min/Max [`GroupsAccumulator`] for "bytes" types ([`StringArray`],
 /// [`BinaryArray`], [`StringViewArray`], etc)
 ///
@@ -493,7 +495,7 @@ impl MinMaxBytesState {
                 )
             }
             EmitTo::First(n) => {
-                let first_min_maxes: Vec<_> = self.min_max.drain(..n).collect();
+                let first_min_maxes = split_vec_min_alloc(&mut self.min_max, n);
                 let first_data_capacity: usize = first_min_maxes
                     .iter()
                     .map(|opt| opt.as_ref().map(|s| s.len()).unwrap_or(0))
 
@@ -30,6 +30,8 @@ use datafusion_common::{
 use datafusion_expr::{EmitTo, GroupsAccumulator};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::apply_filter_as_nulls;
 
+use datafusion_common::utils::split_vec_min_alloc;
+
 /// Accumulator for MIN/MAX operations on Struct data types.
 ///
 /// This accumulator tracks the minimum or maximum struct value encountered
@@ -282,7 +284,7 @@ impl MinMaxStructState {
                 )
             }
             EmitTo::First(n) => {
-                let first_min_maxes: Vec<_> = self.min_max.drain(..n).collect();
+                let first_min_maxes = split_vec_min_alloc(&mut self.min_max, n);
                 let first_data_capacity: usize = first_min_maxes
                     .iter()
                     .map(|opt| opt.as_ref().map(|s| s.len()).unwrap_or(0))
Original file line number	Diff line number	Diff line change
`@@ -45,6 +45,11 @@ impl From<&Partitioning> for FFI_Partitioning {`
`45`	`45`	`.collect();`
`46`	`46`	`Self::Hash(exprs, *size)`
`47`	`47`	`}`
	`48`	`+ // FFI does not yet expose range partition metadata.`
	`49`	`+ // See https://github.com/apache/datafusion/issues/22394`
	`50`	`+ Partitioning::Range(range) => {`
	`51`	`+ Self::UnknownPartitioning(range.partition_count())`
	`52`	`+ }`
`48`	`53`	`Partitioning::UnknownPartitioning(size) => Self::UnknownPartitioning(*size),`
`49`	`54`	`}`
`50`	`55`	`}`