@@ -314,8 +314,7 @@ main() {
314314 data_tpch " 1" " parquet"
315315 ;;
316316 sort_pushdown|sort_pushdown_sorted)
317- # same data as for tpch
318- data_tpch " 1" " parquet"
317+ data_sort_pushdown
319318 ;;
320319 sort_tpch)
321320 # same data as for tpch
@@ -1085,19 +1084,80 @@ run_external_aggr() {
10851084}
10861085
10871086# Sort pushdown benchmark: data generation followed by the benchmark runners
1088- run_sort_pushdown () {
# Generates the sort pushdown benchmark data: three parquet files, each sorted
# by l_orderkey, named so that alphabetical file order is the REVERSE of the
# key order. This forces the sort pushdown optimizer to reorder files by
# statistics instead of relying on file names.
#
# Files written to ${DATA_DIR}/sort_pushdown/lineitem (l_orderkey filter):
#   c_high.parquet: l_orderkey <= 200000            (sorts last by name, lowest keys)
#   b_mid.parquet:  200000 < l_orderkey <= 400000
#   a_low.parquet:  l_orderkey > 400000, unbounded  (sorts first by name, highest keys)
#
# Globals:   DATA_DIR (read), CARGO_COMMAND (read)
# Arguments: none
# Returns:   0 if the data already exists or was generated, 1 on failure
data_sort_pushdown () {
    local out_dir="${DATA_DIR}/sort_pushdown/lineitem"
    local tpch_dir="${DATA_DIR}/tpch_sf1"
    local cli="./target/release/datafusion-cli"
    local name
    local missing=0

    # Skip only when every expected file is present; checking for "any parquet
    # file" would treat an interrupted generation as complete.
    for name in c_high b_mid a_low; do
        if [ ! -f "${out_dir}/${name}.parquet" ]; then
            missing=1
        fi
    done
    if [ "${missing}" -eq 0 ]; then
        echo "Sort pushdown data already exists at ${out_dir}"
        return 0
    fi

    echo "Generating sort pushdown benchmark data..."

    # First ensure we have TPC-H data to work with
    data_tpch "1" "parquet"

    mkdir -p "${out_dir}" || return 1

    # Build datafusion-cli if needed.
    # NOTE(review): this assumes CARGO_COMMAND only builds (or runs and exits)
    # the binary; if it is a `cargo run ...` command it may start the CLI
    # itself -- confirm against where CARGO_COMMAND is defined.
    # CARGO_COMMAND is intentionally unquoted: it holds a command plus arguments.
    # shellcheck disable=SC2086
    if ! $CARGO_COMMAND --bin datafusion-cli; then
        echo "Failed to build datafusion-cli" >&2
        return 1
    fi

    # Prefer the release binary, fall back to the debug one. Both paths are
    # relative to the current working directory.
    if [ ! -f "${cli}" ]; then
        cli="./target/debug/datafusion-cli"
    fi
    if [ ! -x "${cli}" ]; then
        echo "datafusion-cli not found under ./target/release or ./target/debug" >&2
        return 1
    fi

    echo "Creating sorted parquet files with reversed naming..."

    if ! "${cli}" --command "
        CREATE EXTERNAL TABLE lineitem
        STORED AS PARQUET
        LOCATION '${tpch_dir}/lineitem/';

        COPY (
            SELECT * FROM lineitem
            WHERE l_orderkey <= 200000
            ORDER BY l_orderkey ASC
        ) TO '${out_dir}/c_high.parquet';

        COPY (
            SELECT * FROM lineitem
            WHERE l_orderkey > 200000 AND l_orderkey <= 400000
            ORDER BY l_orderkey ASC
        ) TO '${out_dir}/b_mid.parquet';

        COPY (
            SELECT * FROM lineitem
            WHERE l_orderkey > 400000
            ORDER BY l_orderkey ASC
        ) TO '${out_dir}/a_low.parquet';
    "; then
        echo "Failed to generate sort pushdown data" >&2
        # Drop partial output so the next invocation regenerates everything.
        rm -f -- "${out_dir}/c_high.parquet" \
            "${out_dir}/b_mid.parquet" \
            "${out_dir}/a_low.parquet"
        return 1
    fi

    echo "Sort pushdown data generated at ${out_dir}"
    ls -la "${out_dir}"
}
1147+
# Runs the sort pushdown benchmark (without WITH ORDER).
#
# Globals:   DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND, QUERY_ARG,
#            LATENCY_ARG (read); SORT_PUSHDOWN_DIR, RESULTS_FILE (written)
# Arguments: none
# Outputs:   a progress line on stdout; results go to
#            ${RESULTS_DIR}/sort_pushdown.json via dfbench's -o option
run_sort_pushdown () {
    # Parent of the generated lineitem/ directory that data_sort_pushdown fills.
    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
    echo "Running sort pushdown benchmark (no WITH ORDER)..."
    # CARGO_COMMAND, QUERY_ARG and LATENCY_ARG are intentionally unquoted: each
    # may hold several words, and the latter two may be empty.
    # shellcheck disable=SC2086
    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 \
        --path "${SORT_PUSHDOWN_DIR}" \
        --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" \
        -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
10941154
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination).
#
# Globals:   DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND, QUERY_ARG,
#            LATENCY_ARG (read); SORT_PUSHDOWN_DIR, RESULTS_FILE (written)
# Arguments: none
# Outputs:   a progress line on stdout; results go to
#            ${RESULTS_DIR}/sort_pushdown_sorted.json via dfbench's -o option
run_sort_pushdown_sorted () {
    # No space around '=': with one, bash would run SORT_PUSHDOWN_DIR as a
    # command instead of assigning it.
    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
    echo "Running sort pushdown benchmark (with WITH ORDER)..."
    # CARGO_COMMAND, QUERY_ARG and LATENCY_ARG are intentionally unquoted: each
    # may hold several words, and the latter two may be empty.
    # shellcheck disable=SC2086
    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 \
        --path "${SORT_PUSHDOWN_DIR}" \
        --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" \
        -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
11021162
11031163# Runs the sort integration benchmark
0 commit comments