apache
diff --git a/‎datafusion/core/benches/topk_aggregate.rs‎
Lines changed: 183 additions & 0 deletions b/‎datafusion/core/benches/topk_aggregate.rs‎
Lines changed: 183 additions & 0 deletions
diff --git a/‎datafusion/core/src/physical_planner.rs‎
Lines changed: 12 additions & 7 deletions b/‎datafusion/core/src/physical_planner.rs‎
Lines changed: 12 additions & 7 deletions
diff --git a/‎datafusion/core/tests/execution/coop.rs‎
Lines changed: 3 additions & 2 deletions b/‎datafusion/core/tests/execution/coop.rs‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎datafusion/core/tests/parquet/mod.rs‎
Lines changed: 3 additions & 3 deletions b/‎datafusion/core/tests/parquet/mod.rs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs‎
Lines changed: 2 additions & 2 deletions b/‎datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs‎
Lines changed: 11 additions & 9 deletions b/‎datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs‎
Lines changed: 11 additions & 9 deletions
@@ -17,19 +17,63 @@
 
 mod data_utils;
 
+use arrow::array::Int64Builder;
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
 use criterion::{Criterion, criterion_group, criterion_main};
 use data_utils::make_data;
 use datafusion::physical_plan::{collect, displayable};
 use datafusion::prelude::SessionContext;
 use datafusion::{datasource::MemTable, error::Result};
 use datafusion_execution::config::SessionConfig;
+use rand::SeedableRng;
+use rand::seq::SliceRandom;
 use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
 const LIMIT: usize = 10;
 
+/// Create deterministic data for DISTINCT benchmarks with predictable trace_ids
+/// This ensures consistent results across benchmark runs
+fn make_distinct_data(
+    partition_cnt: i32,
+    sample_cnt: i32,
+) -> Result<(Arc<Schema>, Vec<Vec<RecordBatch>>)> {
+    let mut rng = rand::rngs::SmallRng::from_seed([42; 32]);
+    let total_samples = partition_cnt as usize * sample_cnt as usize;
+    let mut ids = Vec::new();
+    for i in 0..total_samples {
+        ids.push(i as i64);
+    }
+    ids.shuffle(&mut rng);
+
+    let mut global_idx = 0;
+    let schema = test_distinct_schema();
+    let mut partitions = vec![];
+    for _ in 0..partition_cnt {
+        let mut id_builder = Int64Builder::new();
+
+        for _ in 0..sample_cnt {
+            let id = ids[global_idx];
+            id_builder.append_value(id);
+            global_idx += 1;
+        }
+
+        let id_col = Arc::new(id_builder.finish());
+        let batch = RecordBatch::try_new(schema.clone(), vec![id_col])?;
+        partitions.push(vec![batch]);
+    }
+
+    Ok((schema, partitions))
+}
+
+/// Returns a Schema for distinct benchmarks with i64 trace_id
+fn test_distinct_schema() -> SchemaRef {
+    Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]))
+}
+
 async fn create_context(
     partition_cnt: i32,
     sample_cnt: i32,
@@ -50,6 +94,25 @@ async fn create_context(
     Ok(ctx)
 }
 
+async fn create_context_distinct(
+    partition_cnt: i32,
+    sample_cnt: i32,
+    use_topk: bool,
+) -> Result<SessionContext> {
+    // Use deterministic data generation for DISTINCT queries to ensure consistent results
+    let (schema, parts) = make_distinct_data(partition_cnt, sample_cnt).unwrap();
+    let mem_table = Arc::new(MemTable::try_new(schema, parts).unwrap());
+
+    // Create the DataFrame
+    let mut cfg = SessionConfig::new();
+    let opts = cfg.options_mut();
+    opts.optimizer.enable_topk_aggregation = use_topk;
+    let ctx = SessionContext::new_with_config(cfg);
+    let _ = ctx.register_table("traces", mem_table)?;
+
+    Ok(ctx)
+}
+
 fn run(rt: &Runtime, ctx: SessionContext, limit: usize, use_topk: bool, asc: bool) {
     black_box(rt.block_on(async { aggregate(ctx, limit, use_topk, asc).await })).unwrap();
 }
@@ -59,6 +122,17 @@ fn run_string(rt: &Runtime, ctx: SessionContext, limit: usize, use_topk: bool) {
         .unwrap();
 }
 
+fn run_distinct(
+    rt: &Runtime,
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+    asc: bool,
+) {
+    black_box(rt.block_on(async { aggregate_distinct(ctx, limit, use_topk, asc).await }))
+        .unwrap();
+}
+
 async fn aggregate(
     ctx: SessionContext,
     limit: usize,
@@ -133,6 +207,84 @@ async fn aggregate_string(
     Ok(())
 }
 
+async fn aggregate_distinct(
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+    asc: bool,
+) -> Result<()> {
+    let order_direction = if asc { "asc" } else { "desc" };
+    let sql = format!(
+        "select id from traces group by id order by id {order_direction} limit {limit};"
+    );
+    let df = ctx.sql(sql.as_str()).await?;
+    let plan = df.create_physical_plan().await?;
+    let actual_phys_plan = displayable(plan.as_ref()).indent(true).to_string();
+    assert_eq!(
+        actual_phys_plan.contains(&format!("lim=[{limit}]")),
+        use_topk
+    );
+    let batches = collect(plan, ctx.task_ctx()).await?;
+    assert_eq!(batches.len(), 1);
+    let batch = batches.first().unwrap();
+    assert_eq!(batch.num_rows(), LIMIT);
+
+    let actual = format!("{}", pretty_format_batches(&batches)?).to_lowercase();
+
+    let expected_asc = r#"
++----+
+| id |
++----+
+| 0  |
+| 1  |
+| 2  |
+| 3  |
+| 4  |
+| 5  |
+| 6  |
+| 7  |
+| 8  |
+| 9  |
++----+
+"#
+    .trim();
+
+    let expected_desc = r#"
++---------+
+| id      |
++---------+
+| 9999999 |
+| 9999998 |
+| 9999997 |
+| 9999996 |
+| 9999995 |
+| 9999994 |
+| 9999993 |
+| 9999992 |
+| 9999991 |
+| 9999990 |
++---------+
+"#
+    .trim();
+
+    // Verify exact results match expected values
+    if asc {
+        assert_eq!(
+            actual.trim(),
+            expected_asc,
+            "Ascending DISTINCT results do not match expected values"
+        );
+    } else {
+        assert_eq!(
+            actual.trim(),
+            expected_desc,
+            "Descending DISTINCT results do not match expected values"
+        );
+    }
+
+    Ok(())
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
     let rt = Runtime::new().unwrap();
     let limit = LIMIT;
@@ -253,6 +405,37 @@ fn criterion_benchmark(c: &mut Criterion) {
         .as_str(),
         |b| b.iter(|| run_string(&rt, ctx.clone(), limit, true)),
     );
+
+    // DISTINCT benchmarks
+    let ctx = rt.block_on(async {
+        create_context_distinct(partitions, samples, false)
+            .await
+            .unwrap()
+    });
+    c.bench_function(
+        format!("distinct {} rows desc [no TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx.clone(), limit, false, false)),
+    );
+
+    c.bench_function(
+        format!("distinct {} rows asc [no TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx.clone(), limit, false, true)),
+    );
+
+    let ctx_topk = rt.block_on(async {
+        create_context_distinct(partitions, samples, true)
+            .await
+            .unwrap()
+    });
+    c.bench_function(
+        format!("distinct {} rows desc [TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx_topk.clone(), limit, true, false)),
+    );
+
+    c.bench_function(
+        format!("distinct {} rows asc [TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx_topk.clone(), limit, true, true)),
+    );
 }
 
 criterion_group!(benches, criterion_benchmark);
 
@@ -39,7 +39,7 @@ use crate::physical_expr::{create_physical_expr, create_physical_exprs};
 use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy};
 use crate::physical_plan::analyze::AnalyzeExec;
 use crate::physical_plan::explain::ExplainExec;
-use crate::physical_plan::filter::FilterExec;
+use crate::physical_plan::filter::FilterExecBuilder;
 use crate::physical_plan::joins::utils as join_utils;
 use crate::physical_plan::joins::{
     CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode, SortMergeJoinExec,
@@ -938,8 +938,12 @@ impl DefaultPhysicalPlanner {
                     input_schema.as_arrow(),
                 )? {
                     PlanAsyncExpr::Sync(PlannedExprResult::Expr(runtime_expr)) => {
-                        FilterExec::try_new(Arc::clone(&runtime_expr[0]), physical_input)?
-                            .with_batch_size(session_state.config().batch_size())?
+                        FilterExecBuilder::new(
+                            Arc::clone(&runtime_expr[0]),
+                            physical_input,
+                        )
+                        .with_batch_size(session_state.config().batch_size())
+                        .build()?
                     }
                     PlanAsyncExpr::Async(
                         async_map,
@@ -949,16 +953,17 @@ impl DefaultPhysicalPlanner {
                             async_map.async_exprs,
                             physical_input,
                         )?;
-                        FilterExec::try_new(
+                        FilterExecBuilder::new(
                             Arc::clone(&runtime_expr[0]),
                             Arc::new(async_exec),
-                        )?
+                        )
                         // project the output columns excluding the async functions
                         // The async functions are always appended to the end of the schema.
-                        .with_projection(Some(
+                        .apply_projection(Some(
                             (0..input.schema().fields().len()).collect(),
                         ))?
-                        .with_batch_size(session_state.config().batch_size())?
+                        .with_batch_size(session_state.config().batch_size())
+                        .build()?
                     }
                     _ => {
                         return internal_err!(
 
@@ -24,7 +24,7 @@ use datafusion::physical_expr::aggregate::AggregateExprBuilder;
 use datafusion::physical_plan;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::physical_plan::aggregates::{
-    AggregateExec, AggregateMode, PhysicalGroupBy,
+    AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy,
 };
 use datafusion::physical_plan::execution_plan::Boundedness;
 use datafusion::prelude::SessionContext;
@@ -233,6 +233,7 @@ async fn agg_grouped_topk_yields(
     #[values(false, true)] pretend_infinite: bool,
 ) -> Result<(), Box<dyn Error>> {
     // build session
+
     let session_ctx = SessionContext::new();
 
     // set up a top-k aggregation
@@ -260,7 +261,7 @@ async fn agg_grouped_topk_yields(
             inf.clone(),
             inf.schema(),
         )?
-        .with_limit(Some(100)),
+        .with_limit_options(Some(LimitOptions::new(100))),
     );
 
     query_yields(aggr, session_ctx.task_ctx()).await
 
@@ -576,9 +576,9 @@ fn make_uint_batches(start: u8, end: u8) -> RecordBatch {
         Field::new("u64", DataType::UInt64, true),
     ]));
     let v8: Vec<u8> = (start..end).collect();
-    let v16: Vec<u16> = (start as _..end as _).collect();
-    let v32: Vec<u32> = (start as _..end as _).collect();
-    let v64: Vec<u64> = (start as _..end as _).collect();
+    let v16: Vec<u16> = (start as u16..end as u16).collect();
+    let v32: Vec<u32> = (start as u32..end as u32).collect();
+    let v64: Vec<u64> = (start as u64..end as u64).collect();
     RecordBatch::try_new(
         schema,
         vec![
 
@@ -37,7 +37,7 @@ use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_optimizer::combine_partial_final_agg::CombinePartialFinalAggregate;
 use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::{
-    AggregateExec, AggregateMode, PhysicalGroupBy,
+    AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy,
 };
 use datafusion_physical_plan::displayable;
 use datafusion_physical_plan::repartition::RepartitionExec;
@@ -260,7 +260,7 @@ fn aggregations_with_limit_combined() -> datafusion_common::Result<()> {
             schema,
         )
         .unwrap()
-        .with_limit(Some(5)),
+        .with_limit_options(Some(LimitOptions::new(5))),
     );
     let plan: Arc<dyn ExecutionPlan> = final_agg;
     // should combine the Partial/Final AggregateExecs to a Single AggregateExec
 
@@ -58,7 +58,7 @@ use datafusion_physical_plan::{
     aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy},
     coalesce_partitions::CoalescePartitionsExec,
     collect,
-    filter::FilterExec,
+    filter::{FilterExec, FilterExecBuilder},
     repartition::RepartitionExec,
     sorts::sort::SortExec,
 };
@@ -480,9 +480,10 @@ fn test_filter_with_projection() {
     let projection = vec![1, 0];
     let predicate = col_lit_predicate("a", "foo", &schema());
     let plan = Arc::new(
-        FilterExec::try_new(predicate, Arc::clone(&scan))
+        FilterExecBuilder::new(predicate, Arc::clone(&scan))
+            .apply_projection(Some(projection))
             .unwrap()
-            .with_projection(Some(projection))
+            .build()
             .unwrap(),
     );
 
@@ -505,9 +506,10 @@ fn test_filter_with_projection() {
     let projection = vec![1];
     let predicate = col_lit_predicate("a", "foo", &schema());
     let plan = Arc::new(
-        FilterExec::try_new(predicate, scan)
+        FilterExecBuilder::new(predicate, scan)
+            .apply_projection(Some(projection))
             .unwrap()
-            .with_projection(Some(projection))
+            .build()
             .unwrap(),
     );
     insta::assert_snapshot!(
@@ -564,9 +566,9 @@ fn test_pushdown_through_aggregates_on_grouping_columns() {
     let scan = TestScanBuilder::new(schema()).with_support(true).build();
 
     let filter = Arc::new(
-        FilterExec::try_new(col_lit_predicate("a", "foo", &schema()), scan)
-            .unwrap()
+        FilterExecBuilder::new(col_lit_predicate("a", "foo", &schema()), scan)
             .with_batch_size(10)
+            .build()
             .unwrap(),
     );
 
@@ -596,9 +598,9 @@ fn test_pushdown_through_aggregates_on_grouping_columns() {
 
     let predicate = col_lit_predicate("b", "bar", &schema());
     let plan = Arc::new(
-        FilterExec::try_new(predicate, aggregate)
-            .unwrap()
+        FilterExecBuilder::new(predicate, aggregate)
             .with_batch_size(100)
+            .build()
             .unwrap(),
     );