@@ -314,8 +314,7 @@ main() {
314314 data_tpch " 1" " parquet"
315315 ;;
316316 sort_pushdown|sort_pushdown_sorted)
317- # same data as for tpch
318- data_tpch " 1" " parquet"
317+ data_sort_pushdown
319318 ;;
320319 sort_tpch)
321320 # same data as for tpch
@@ -1085,19 +1084,57 @@ run_external_aggr() {
10851084}
10861085
# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
# renamed so alphabetical order does NOT match sort key order.
# This forces the sort pushdown optimizer to reorder files by statistics.
#
# tpchgen produces 3 sorted, non-overlapping parquet files:
#   lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys)
#   lineitem.2.parquet: l_orderkey 2M ~ 4M
#   lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys)
#
# We rename them so alphabetical order is reversed:
#   a_part3.parquet (highest keys, sorts first alphabetically)
#   b_part2.parquet
#   c_part1.parquet (lowest keys, sorts last alphabetically)
#
# Globals:
#   DATA_DIR (read) - root directory for benchmark data
# Outputs:
#   Progress messages on stdout, failure messages on stderr
# Returns:
#   0 if the data already exists or was generated, 1 if generation or a
#   rename failed
data_sort_pushdown() {
    local out_dir="${DATA_DIR}/sort_pushdown/lineitem"
    local temp_dir="${DATA_DIR}/sort_pushdown_temp"

    # Require all three renamed files: a partial set left behind by an
    # interrupted run must be regenerated, not reused.
    if [ -f "${out_dir}/a_part3.parquet" ] \
        && [ -f "${out_dir}/b_part2.parquet" ] \
        && [ -f "${out_dir}/c_part1.parquet" ]; then
        echo "Sort pushdown data already exists at ${out_dir}"
        return 0
    fi

    echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."

    # Start from an empty scratch directory so leftovers from an earlier
    # failed run cannot be mistaken for fresh output.
    rm -rf -- "${temp_dir}"
    mkdir -p "${temp_dir}" "${out_dir}" || return 1

    if ! tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${temp_dir}"; then
        echo "tpchgen-cli failed; sort pushdown data was not generated" >&2
        rm -rf -- "${temp_dir}"
        return 1
    fi

    # Rename: reverse alphabetical order vs key order
    if ! {
        mv "${temp_dir}/lineitem/lineitem.3.parquet" "${out_dir}/a_part3.parquet" \
            && mv "${temp_dir}/lineitem/lineitem.2.parquet" "${out_dir}/b_part2.parquet" \
            && mv "${temp_dir}/lineitem/lineitem.1.parquet" "${out_dir}/c_part1.parquet"
    }; then
        echo "Failed to move generated lineitem parts into ${out_dir}" >&2
        rm -rf -- "${temp_dir}"
        return 1
    fi

    rm -rf -- "${temp_dir}"

    echo "Sort pushdown data generated at ${out_dir}"
    ls -la "${out_dir}"
}

# Runs the sort pushdown benchmark (without WITH ORDER)
run_sort_pushdown() {
    # Benchmark variant without the --sorted flag (no WITH ORDER).
    # Globals read: DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND,
    #               QUERY_ARG, LATENCY_ARG
    # Globals set:  SORT_PUSHDOWN_DIR, RESULTS_FILE
    #
    # Dataset root; data_sort_pushdown writes the renamed lineitem files
    # into the lineitem/ subdirectory below it.
    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
    echo "Running sort pushdown benchmark (no WITH ORDER)..."
    # $CARGO_COMMAND, ${QUERY_ARG} and ${LATENCY_ARG} are left unquoted,
    # presumably so they can expand to several words or to nothing.
    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
10941131
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
#
# Globals read: DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND,
#               QUERY_ARG, LATENCY_ARG
# Globals set:  SORT_PUSHDOWN_DIR, RESULTS_FILE
run_sort_pushdown_sorted() {
    # Dataset root; data_sort_pushdown writes the renamed lineitem files
    # into the lineitem/ subdirectory below it.
    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
    echo "Running sort pushdown benchmark (with WITH ORDER)..."
    # $CARGO_COMMAND, ${QUERY_ARG} and ${LATENCY_ARG} are left unquoted,
    # presumably so they can expand to several words or to nothing.
    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
11021139
11031140# Runs the sort integration benchmark
0 commit comments