Skip to content

Commit a732271

Browse files
committed
feat: generate reversed-name data for sort pushdown benchmark
Use `tpchgen-cli --parts=3` to create three sorted, non-overlapping Parquet files, then rename them so that alphabetical order is the reverse of key order:
  - a_part3.parquet (highest keys, sorts first alphabetically)
  - b_part2.parquet
  - c_part1.parquet (lowest keys, sorts last alphabetically)
This is much simpler than the previous approach (no datafusion-cli needed).
Release benchmark results (6M rows, single partition):
  - Q1 ORDER BY ASC: main 259ms → PR 122ms (53% faster)
  - Q3 SELECT * ORDER BY: main 700ms → PR 353ms (50% faster)
1 parent 1e68674 commit a732271

File tree

1 file changed

+43
-6
lines changed

1 file changed

+43
-6
lines changed

benchmarks/bench.sh

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,7 @@ main() {
314314
data_tpch "1" "parquet"
315315
;;
316316
sort_pushdown|sort_pushdown_sorted)
317-
# same data as for tpch
318-
data_tpch "1" "parquet"
317+
data_sort_pushdown
319318
;;
320319
sort_tpch)
321320
# same data as for tpch
@@ -1085,19 +1084,57 @@ run_external_aggr() {
10851084
}
10861085

10871086
# Runs the sort pushdown benchmark (without WITH ORDER)
1087+
# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
1088+
# renamed so alphabetical order does NOT match sort key order.
1089+
# This forces the sort pushdown optimizer to reorder files by statistics.
1090+
#
1091+
# tpchgen produces 3 sorted, non-overlapping parquet files:
1092+
# lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys)
1093+
# lineitem.2.parquet: l_orderkey 2M ~ 4M
1094+
# lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys)
1095+
#
1096+
# We rename them so alphabetical order is reversed:
1097+
# a_part3.parquet (highest keys, sorts first alphabetically)
1098+
# b_part2.parquet
1099+
# c_part1.parquet (lowest keys, sorts last alphabetically)
1100+
data_sort_pushdown() {
    # Generates the sort pushdown dataset under
    # ${DATA_DIR}/sort_pushdown/lineitem.
    #
    # Globals:  DATA_DIR (read)
    # Requires: tpchgen-cli on PATH
    # Returns:  0 if the data already exists or was generated,
    #           1 if generation or the renaming step failed.
    local out_dir="${DATA_DIR}/sort_pushdown/lineitem"
    local temp_dir="${DATA_DIR}/sort_pushdown_temp"
    # "<tpchgen part number>:<destination file name>". The destination names
    # sort alphabetically in the opposite order to the part numbers.
    local -a renames=(
        "3:a_part3.parquet"
        "2:b_part2.parquet"
        "1:c_part1.parquet"
    )
    local entry
    local complete=1

    # Only treat the data as present when every expected file exists, so a
    # directory left half-populated by an interrupted run is regenerated.
    for entry in "${renames[@]}"; do
        if [ ! -f "${out_dir}/${entry#*:}" ]; then
            complete=0
        fi
    done
    if [ "${complete}" -eq 1 ]; then
        echo "Sort pushdown data already exists at ${out_dir}"
        return 0
    fi

    echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."

    # Start from an empty temp directory; an earlier failed run may have left
    # files behind.
    rm -rf -- "${temp_dir:?}"
    mkdir -p "${temp_dir}" "${out_dir}" || return 1

    if ! tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${temp_dir}"; then
        echo "tpchgen-cli failed; sort pushdown data was not generated" >&2
        rm -rf -- "${temp_dir:?}"
        return 1
    fi

    # Drop leftovers of an incomplete earlier run so the table directory ends
    # up containing exactly the three files moved in below.
    rm -f -- "${out_dir:?}"/*.parquet

    # Rename: reverse alphabetical order vs key order
    for entry in "${renames[@]}"; do
        if ! mv "${temp_dir}/lineitem/lineitem.${entry%%:*}.parquet" "${out_dir}/${entry#*:}"; then
            echo "Failed to move lineitem part ${entry%%:*} into ${out_dir}" >&2
            rm -rf -- "${temp_dir:?}"
            return 1
        fi
    done

    rm -rf -- "${temp_dir:?}"

    echo "Sort pushdown data generated at ${out_dir}"
    ls -la "${out_dir}"
}
1124+
10881125
run_sort_pushdown() {
    # Runs dfbench's sort-pushdown benchmark without the --sorted flag.
    # Reads:  DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND, QUERY_ARG, LATENCY_ARG
    # Writes: SORT_PUSHDOWN_DIR, RESULTS_FILE
    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
    echo "Running sort pushdown benchmark (no WITH ORDER)..."

    # QUERY_ARG and LATENCY_ARG are left unquoted on purpose so that an empty
    # value contributes no argument and a multi-word value is split.
    local -a dfbench_args=(
        sort-pushdown
        --iterations 5
        --path "${SORT_PUSHDOWN_DIR}"
        --queries-path "${SCRIPT_DIR}/queries/sort_pushdown"
        -o "${RESULTS_FILE}"
        ${QUERY_ARG}
        ${LATENCY_ARG}
    )
    debug_run $CARGO_COMMAND --bin dfbench -- "${dfbench_args[@]}"
}
10941131

10951132
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
10961133
run_sort_pushdown_sorted() {
    # Runs dfbench's sort-pushdown benchmark with the --sorted flag
    # (the WITH ORDER variant).
    # Reads:  DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND, QUERY_ARG, LATENCY_ARG
    # Writes: SORT_PUSHDOWN_DIR, RESULTS_FILE
    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
    echo "Running sort pushdown benchmark (with WITH ORDER)..."

    # QUERY_ARG and LATENCY_ARG are left unquoted on purpose so that an empty
    # value contributes no argument and a multi-word value is split.
    local -a dfbench_args=(
        sort-pushdown
        --sorted
        --iterations 5
        --path "${SORT_PUSHDOWN_DIR}"
        --queries-path "${SCRIPT_DIR}/queries/sort_pushdown"
        -o "${RESULTS_FILE}"
        ${QUERY_ARG}
        ${LATENCY_ARG}
    )
    debug_run $CARGO_COMMAND --bin dfbench -- "${dfbench_args[@]}"
}
11021139

11031140
# Runs the sort integration benchmark

0 commit comments

Comments
 (0)