Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ clickbench_partitioned: ClickBench queries against partitioned (100 files) parqu
clickbench_pushdown: ClickBench queries against partitioned (100 files) parquet w/ filter_pushdown enabled
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)

# Sort Pushdown Benchmarks
sort_pushdown: Sort pushdown baseline (no WITH ORDER) on TPC-H data (SF=1)
sort_pushdown_sorted: Sort pushdown with WITH ORDER — tests sort elimination on non-overlapping files

# Sorted Data Benchmarks (ORDER BY Optimization)
clickbench_sorted: ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)

Expand Down Expand Up @@ -309,6 +313,10 @@ main() {
# same data as for tpch
data_tpch "1" "parquet"
;;
sort_pushdown|sort_pushdown_sorted)
# same data as for tpch
data_tpch "1" "parquet"
;;
Comment on lines +316 to +319
Copy link

Copilot AI Mar 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sort_pushdown|sort_pushdown_sorted reuses data_tpch "1" "parquet", which currently generates parquet with --parts=1. That means the benchmark will typically run against a single lineitem parquet file, not multiple non-overlapping files as described. If the intent is to benchmark cross-file sort elimination, consider generating TPCH parquet with multiple parts for this benchmark (and documenting that the files are expected to be sorted by l_orderkey).

Copilot uses AI. Check for mistakes.
sort_tpch)
# same data as for tpch
data_tpch "1" "parquet"
Expand Down Expand Up @@ -509,6 +517,12 @@ main() {
external_aggr)
run_external_aggr
;;
sort_pushdown)
run_sort_pushdown
;;
sort_pushdown_sorted)
run_sort_pushdown_sorted
;;
sort_tpch)
run_sort_tpch "1"
;;
Expand Down Expand Up @@ -1070,6 +1084,22 @@ run_external_aggr() {
debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Sort pushdown baseline: invokes the `sort-pushdown` dfbench subcommand
# without `--sorted` (i.e. no WITH ORDER) on the data under ${DATA_DIR}/tpch_sf1.
# Results are written to ${RESULTS_DIR}/sort_pushdown.json.
run_sort_pushdown() {
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
    TPCH_DIR="${DATA_DIR}/tpch_sf1"
    echo "Running sort pushdown benchmark (no WITH ORDER)..."
    # QUERY_ARG and LATENCY_ARG are left unquoted, so an empty value
    # contributes no argument to the command line.
    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown \
        --iterations 5 \
        --path "${TPCH_DIR}" \
        -o "${RESULTS_FILE}" \
        ${QUERY_ARG} ${LATENCY_ARG}
}
Comment on lines +1090 to +1093
Copy link

Copilot AI Mar 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new sort-pushdown benchmark is described as single-partition in the module docs, but the bench runner doesn't pass --partitions here (so it will use DataFusion's default target partitions). If you want stable, comparable results and to match the documented plan shape (no SPM for single partition), consider passing --partitions 1 (or documenting the expected/default partitions) in these runner functions.

Copilot uses AI. Check for mistakes.

# Sort pushdown with WITH ORDER: invokes the `sort-pushdown` dfbench subcommand
# with `--sorted` on the data under ${DATA_DIR}/tpch_sf1.
# Results are written to ${RESULTS_DIR}/sort_pushdown_sorted.json.
run_sort_pushdown_sorted() {
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
    TPCH_DIR="${DATA_DIR}/tpch_sf1"
    echo "Running sort pushdown benchmark (with WITH ORDER)..."
    # QUERY_ARG and LATENCY_ARG are left unquoted, so an empty value
    # contributes no argument to the command line.
    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted \
        --iterations 5 \
        --path "${TPCH_DIR}" \
        -o "${RESULTS_FILE}" \
        ${QUERY_ARG} ${LATENCY_ARG}
}

# Runs the sort integration benchmark
run_sort_tpch() {
SCALE_FACTOR=$1
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/queries/sort_pushdown/q1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Sort elimination: ORDER BY sort key ASC (full scan)
-- With --sorted: SortExec removed, sequential scan in file order
-- Without --sorted: full SortExec required
-- Narrow projection (three columns) with no LIMIT, so every row of lineitem is returned.
SELECT l_orderkey, l_partkey, l_suppkey
FROM lineitem
ORDER BY l_orderkey
7 changes: 7 additions & 0 deletions benchmarks/queries/sort_pushdown/q2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Sort elimination + limit pushdown
-- With --sorted: SortExec removed + limit pushed to DataSourceExec
-- Without --sorted: TopK sort over all data
-- Narrow projection (three columns); only the first 100 rows in l_orderkey order are returned.
SELECT l_orderkey, l_partkey, l_suppkey
FROM lineitem
ORDER BY l_orderkey
LIMIT 100
5 changes: 5 additions & 0 deletions benchmarks/queries/sort_pushdown/q3.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Sort elimination: wide projection (all columns)
-- Tests sort elimination benefit with larger row payload
-- No LIMIT: every row of lineitem is returned, ordered by l_orderkey.
-- NOTE(review): presumably --sorted removes the SortExec here and a full
-- SortExec is needed without it — confirm against the actual plan.
SELECT *
FROM lineitem
ORDER BY l_orderkey
5 changes: 5 additions & 0 deletions benchmarks/queries/sort_pushdown/q4.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Sort elimination + limit: wide projection
-- Returns all columns for only the first 100 rows in l_orderkey order.
-- NOTE(review): presumably --sorted removes the SortExec and pushes the limit
-- into the scan, while the unsorted run uses a TopK sort — confirm against
-- the actual plan.
SELECT *
FROM lineitem
ORDER BY l_orderkey
LIMIT 100
5 changes: 4 additions & 1 deletion benchmarks/src/bin/dfbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

use datafusion_benchmarks::{
cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_tpch, tpcds, tpch,
cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_pushdown, sort_tpch, tpcds,
tpch,
};

#[derive(Debug, Parser)]
Expand All @@ -53,6 +54,7 @@ enum Options {
Imdb(imdb::RunOpt),
Nlj(nlj::RunOpt),
Smj(smj::RunOpt),
SortPushdown(sort_pushdown::RunOpt),
SortTpch(sort_tpch::RunOpt),
Tpch(tpch::RunOpt),
Tpcds(tpcds::RunOpt),
Expand All @@ -72,6 +74,7 @@ pub async fn main() -> Result<()> {
Options::Imdb(opt) => Box::pin(opt.run()).await,
Options::Nlj(opt) => opt.run().await,
Options::Smj(opt) => opt.run().await,
Options::SortPushdown(opt) => opt.run().await,
Options::SortTpch(opt) => opt.run().await,
Options::Tpch(opt) => Box::pin(opt.run()).await,
Options::Tpcds(opt) => Box::pin(opt.run()).await,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub mod hj;
pub mod imdb;
pub mod nlj;
pub mod smj;
pub mod sort_pushdown;
pub mod sort_tpch;
pub mod tpcds;
pub mod tpch;
Expand Down
Loading
Loading