Compact more aggressively in TopK based upon memory usage (#20381)

cetra3 · web-flow · commit 3d43f560ab4b · 2026-05-19T20:31:11.000Z
## Which issue does this PR close?

Sort of addresses #19386

## Rationale for this change

Compaction in TopK currently uses some hard-coded heuristics to decide when to compact. Instead, we can use the memory size of the batches as a bound.

## What changes are included in this PR?

Adjusts TopK compaction to compact more aggressively, based upon memory size.

## Are these changes tested?

Yes, and a test has been added.

## Are there any user-facing changes?

No

## Benchmarks

I'm struggling (in this PR and other PRs...) to get some good, reliable benchmarks through. However, with this one it seems like it's *mostly* the same speed as `main`, or a little faster in some cases:

```
+ critcmp main topk_memory_batch
group                                                           main                       topk_memory_batch
-----                                                           ----                       -----------------
aggregate 10000000 time-series rows                             1.05  45.5±2.13ms ? ?/sec  1.00  43.5±2.09ms ? ?/sec
aggregate 10000000 worst-case rows                              1.06  43.9±1.12ms ? ?/sec  1.00  41.5±1.03ms ? ?/sec
distinct 10000000 rows asc [TopK]                               1.00   4.3±0.25ms ? ?/sec  1.00   4.3±0.07ms ? ?/sec
distinct 10000000 rows asc [no TopK]                            1.00  42.9±1.83ms ? ?/sec  1.06  45.5±1.65ms ? ?/sec
distinct 10000000 rows desc [TopK]                              1.00   4.3±0.07ms ? ?/sec  1.01   4.3±0.06ms ? ?/sec
distinct 10000000 rows desc [no TopK]                           1.00  42.4±1.48ms ? ?/sec  1.05  44.7±1.88ms ? ?/sec
top k=10 aggregate 10000000 time-series rows                    1.13  10.6±0.54ms ? ?/sec  1.00   9.4±0.64ms ? ?/sec
top k=10 aggregate 10000000 time-series rows [Utf8View]         1.08  11.0±0.65ms ? ?/sec  1.00  10.2±0.48ms ? ?/sec
top k=10 aggregate 10000000 worst-case rows                     1.04  16.8±1.47ms ? ?/sec  1.00  16.1±1.44ms ? ?/sec
top k=10 aggregate 10000000 worst-case rows [Utf8View]          1.10  17.9±1.51ms ? ?/sec  1.00  16.2±1.16ms ? ?/sec
top k=10 string aggregate 10000000 time-series rows [Utf8View]  1.11   9.1±0.36ms ? ?/sec  1.00   8.2±0.35ms ? ?/sec
top k=10 string aggregate 10000000 time-series rows [Utf8]      1.12   7.8±0.48ms ? ?/sec  1.00   6.9±0.26ms ? ?/sec
top k=10 string aggregate 10000000 worst-case rows [Utf8View]   1.06   8.6±0.21ms ? ?/sec  1.00   8.1±0.20ms ? ?/sec
top k=10 string aggregate 10000000 worst-case rows [Utf8]       1.07   7.4±0.17ms ? ?/sec  1.00   6.9±0.13ms ? ?/sec
```
diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs
@@ -221,7 +221,7 @@ impl TopK {
             expr,
             row_converter,
             scratch_rows,
-            heap: TopKHeap::new(k, batch_size),
+            heap: TopKHeap::new(k),
             common_sort_prefix_converter: prefix_row_converter,
             common_sort_prefix: Arc::from(common_sort_prefix),
             finished: false,
@@ -663,8 +663,6 @@ impl TopKMetrics {
 struct TopKHeap {
     /// The maximum number of elements to store in this heap.
     k: usize,
-    /// The target number of rows for output batches
-    batch_size: usize,
     /// Storage for up at most `k` items using a BinaryHeap. Reversed
     /// so that the smallest k so far is on the top
     inner: BinaryHeap<TopKRow>,
@@ -675,11 +673,10 @@ struct TopKHeap {
 }
 
 impl TopKHeap {
-    fn new(k: usize, batch_size: usize) -> Self {
+    fn new(k: usize) -> Self {
         assert!(k > 0);
         Self {
             k,
-            batch_size,
             inner: BinaryHeap::new(),
             store: RecordBatchStore::new(),
             owned_bytes: 0,
@@ -792,24 +789,26 @@ impl TopKHeap {
     /// Compact this heap, rewriting all stored batches into a single
     /// input batch
     pub fn maybe_compact(&mut self) -> Result<()> {
-        // we compact if the number of "unused" rows in the store is
-        // past some pre-defined threshold. Target holding up to
-        // around 20 batches, but handle cases of large k where some
-        // batches might be partially full
-        let max_unused_rows = (20 * self.batch_size) + self.k;
-        let unused_rows = self.store.unused_rows();
-
-        // don't compact if the store has one extra batch or
-        // unused rows is under the threshold
-        if self.store.len() <= 2 || unused_rows < max_unused_rows {
+        // Don't compact if there's only one batch (compacting into itself is pointless)
+        if self.store.len() <= 1 {
+            return Ok(());
+        }
+
+        let total_rows = self.store.total_rows;
+        let num_rows = self.inner.len();
+
+        // Compact when the store holds more than 2x the rows the compacted
+        // result would need (row counts, not byte sizes, are compared here).
+        // The multiplier avoids compacting when the savings would be marginal.
+        if total_rows <= num_rows * 2 {
             return Ok(());
         }
+
         // at first, compact the entire thing always into a new batch
         // (maybe we can get fancier in the future about ignoring
         // batches that have a high usage ratio already
 
         // Note: new batch is in the same order as inner
-        let num_rows = self.inner.len();
         let (new_batch, mut topk_rows) = self.emit_with_state()?;
         let Some(new_batch) = new_batch else {
             return Ok(());
@@ -969,6 +968,8 @@ struct RecordBatchStore {
     batches: HashMap<u32, RecordBatchEntry>,
     /// total size of all record batches tracked by this store
     batches_size: usize,
+    /// row count of all the batches
+    total_rows: usize,
 }
 
 impl RecordBatchStore {
@@ -977,6 +978,7 @@ impl RecordBatchStore {
             next_id: 0,
             batches: HashMap::new(),
             batches_size: 0,
+            total_rows: 0,
         }
     }
 
@@ -994,6 +996,7 @@ impl RecordBatchStore {
         // uses of 0 means that none of the rows in the batch were stored in the topk
         if entry.uses > 0 {
             self.batches_size += get_record_batch_memory_size(&entry.batch);
+            self.total_rows += entry.batch.num_rows();
             self.batches.insert(entry.id, entry);
         }
     }
@@ -1002,6 +1005,7 @@ impl RecordBatchStore {
     fn clear(&mut self) {
         self.batches.clear();
         self.batches_size = 0;
+        self.total_rows = 0;
     }
 
     fn get(&self, id: u32) -> Option<&RecordBatchEntry> {
@@ -1013,15 +1017,6 @@ impl RecordBatchStore {
         self.batches.len()
     }
 
-    /// Returns the total number of rows in batches minus the number
-    /// which are in use
-    fn unused_rows(&self) -> usize {
-        self.batches
-            .values()
-            .map(|batch_entry| batch_entry.batch.num_rows() - batch_entry.uses)
-            .sum()
-    }
-
     /// returns true if the store has nothing stored
     fn is_empty(&self) -> bool {
         self.batches.is_empty()
@@ -1045,6 +1040,11 @@ impl RecordBatchStore {
                 .batches_size
                 .checked_sub(get_record_batch_memory_size(&old_entry.batch))
                 .unwrap();
+
+            self.total_rows = self
+                .total_rows
+                .checked_sub(old_entry.batch.num_rows())
+                .unwrap();
         }
     }
 
@@ -1060,7 +1060,7 @@ impl RecordBatchStore {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow::array::{Float64Array, Int32Array};
+    use arrow::array::{BooleanArray, Float64Array, Int32Array};
     use arrow::datatypes::{DataType, Field, Schema};
     use arrow_schema::SortOptions;
     use datafusion_common::assert_batches_eq;
@@ -1243,4 +1243,184 @@ mod tests {
 
         Ok(())
     }
+
+    /// Tests that memory-based compaction triggers when a large batch
+    /// has very few rows referenced by the top-k heap.
+    #[tokio::test]
+    async fn test_topk_memory_compaction() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        let sort_expr = PhysicalSortExpr {
+            expr: col("a", schema.as_ref())?,
+            options: SortOptions::default(),
+        };
+
+        let full_expr = LexOrdering::from([sort_expr.clone()]);
+        let prefix = vec![sort_expr];
+
+        let runtime = Arc::new(RuntimeEnv::default());
+        let metrics = ExecutionPlanMetricsSet::new();
+
+        let k = 5;
+        let mut topk = TopK::try_new(
+            0,
+            Arc::clone(&schema),
+            prefix,
+            full_expr,
+            k,
+            8192,
+            runtime,
+            &metrics,
+            Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new(
+                DynamicFilterPhysicalExpr::new(vec![], lit(true)),
+            )))),
+        )?;
+
+        // Insert a large batch (100,000 rows) with values 1..=100_000.
+        // Only the smallest 5 values (1..=5) will end up in the heap.
+        let large_values: Vec<i32> = (1..=100_000).collect();
+        let array1: ArrayRef = Arc::new(Int32Array::from(large_values));
+        let batch1 = RecordBatch::try_new(Arc::clone(&schema), vec![array1])?;
+        topk.insert_batch(batch1)?;
+
+        // After the first batch, store has 1 batch — compaction should
+        // not trigger (guard: store.len() <= 1).
+        assert_eq!(
+            topk.heap.store.len(),
+            1,
+            "should have 1 batch before second insert"
+        );
+
+        // Insert a second batch whose values displace entries in the heap.
+        // -1 and 0 are smaller than the current top-5 (1..=5), so they
+        // produce 2 replacements. With replacements > 0, `insert_batch`
+        // calls `insert_batch_entry` (briefly making store.len() == 2)
+        // and then `maybe_compact`, which should collapse it back to 1.
+        let array2: ArrayRef = Arc::new(Int32Array::from(vec![-1, 0]));
+        let batch2 = RecordBatch::try_new(Arc::clone(&schema), vec![array2])?;
+        let replacements_before = topk.metrics.row_replacements.value();
+        topk.insert_batch(batch2)?;
+
+        // Sanity check: batch2 was actually integrated. Without
+        // replacements, `maybe_compact` is never called and the
+        // store-length assertion below would pass vacuously.
+        assert!(
+            topk.metrics.row_replacements.value() > replacements_before,
+            "batch2 must produce replacements so compaction is exercised"
+        );
+
+        // The compacted-estimate guard is `total_rows <= num_rows * 2`,
+        // i.e. 100_002 <= 10, which is false, so compaction fires and
+        // collapses the two stored batches back into one.
+        assert_eq!(
+            topk.heap.store.len(),
+            1,
+            "store should be compacted to 1 batch"
+        );
+
+        // Verify the emitted results are correct (top 5 ascending).
+        let results: Vec<_> = topk.emit()?.try_collect().await?;
+        assert_batches_eq!(
+            &[
+                "+----+", "| a  |", "+----+", "| -1 |", "| 0  |", "| 1  |", "| 2  |",
+                "| 3  |", "+----+",
+            ],
+            &results
+        );
+
+        Ok(())
+    }
+
+    /// Negative path: when stored rows are close to the heap size,
+    /// compaction must NOT fire even with multiple batches present,
+    /// because the savings would be marginal
+    /// (guard: `total_rows <= num_rows * 2`).
+    ///
+    /// Uses a bit-packed `BooleanArray` so that future changes to the
+    /// compaction heuristic that reintroduce a per-byte estimate
+    /// (where integer truncation could misbehave on sub-byte types)
+    /// are caught here.
+    #[tokio::test]
+    async fn test_topk_memory_compaction_skipped_when_marginal() -> Result<()> {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)]));
+
+        let sort_expr = PhysicalSortExpr {
+            expr: col("a", schema.as_ref())?,
+            options: SortOptions::default(),
+        };
+        let full_expr = LexOrdering::from([sort_expr.clone()]);
+        let prefix = vec![sort_expr];
+
+        let runtime = Arc::new(RuntimeEnv::default());
+        let metrics = ExecutionPlanMetricsSet::new();
+
+        let k = 10;
+        let mut topk = TopK::try_new(
+            0,
+            Arc::clone(&schema),
+            prefix,
+            full_expr,
+            k,
+            8192,
+            runtime,
+            &metrics,
+            Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new(
+                DynamicFilterPhysicalExpr::new(vec![], lit(true)),
+            )))),
+        )?;
+
+        // Two small batches; every row from both batches ends up referenced
+        // by the heap, so total_rows == num_rows == 10.
+        let batch1 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(BooleanArray::from(vec![false, false, true, true, true]))
+                    as ArrayRef,
+            ],
+        )?;
+        topk.insert_batch(batch1)?;
+
+        let batch2 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(BooleanArray::from(vec![false, false, false, true, true]))
+                    as ArrayRef,
+            ],
+        )?;
+        topk.insert_batch(batch2)?;
+
+        // Guard `total_rows <= num_rows * 2` should hold (10 <= 20),
+        // so compaction is skipped and BOTH batches remain in the store.
+        assert_eq!(
+            topk.heap.store.len(),
+            2,
+            "store must keep 2 batches when savings would be marginal"
+        );
+        assert_eq!(topk.heap.inner.len(), 10, "heap should hold all 10 rows");
+
+        // Output is still correct (5 falses then 5 trues ascending).
+        let results: Vec<_> = topk.emit()?.try_collect().await?;
+        assert_batches_eq!(
+            &[
+                "+-------+",
+                "| a     |",
+                "+-------+",
+                "| false |",
+                "| false |",
+                "| false |",
+                "| false |",
+                "| false |",
+                "| true  |",
+                "| true  |",
+                "| true  |",
+                "| true  |",
+                "| true  |",
+                "+-------+",
+            ],
+            &results
+        );
+
+        Ok(())
+    }
 }