Skip to content

Commit 0a41047

Browse files
zhuqi-lucas and claude committed
feat: generate reversed-name data for sort pushdown benchmark
Generate benchmark data with file names that don't match sort key order:
- c_high.parquet: l_orderkey 1-200k (c sorts last, but has lowest keys)
- b_mid.parquet: l_orderkey 200k-400k
- a_low.parquet: l_orderkey 400k+ (a sorts first, but has highest keys)

This ensures the sort pushdown optimizer must reorder files by statistics to achieve sort elimination. Without the optimization, the SortExec stays in the plan.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ccaf802 commit 0a41047

File tree

1 file changed

+66
-6
lines changed

1 file changed

+66
-6
lines changed

benchmarks/bench.sh

Lines changed: 66 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,7 @@ main() {
314314
data_tpch "1" "parquet"
315315
;;
316316
sort_pushdown|sort_pushdown_sorted)
317-
# same data as for tpch
318-
data_tpch "1" "parquet"
317+
data_sort_pushdown
319318
;;
320319
sort_tpch)
321320
# same data as for tpch
@@ -1085,19 +1084,80 @@ run_external_aggr() {
10851084
}
10861085

10871086
# Runs the sort pushdown benchmark (without WITH ORDER)
1088-
run_sort_pushdown() {
1087+
# Generates sort pushdown benchmark data: multiple sorted parquet files with
# reversed naming so alphabetical order does NOT match sort key order.
# This forces the sort pushdown optimizer to reorder files by statistics.
#
# Files created (l_orderkey ranges, as selected by the queries below):
#   c_high.parquet: l_orderkey <= 200000  (c sorts last alphabetically, but has lowest keys)
#   b_mid.parquet:  200001-400000
#   a_low.parquet:  l_orderkey > 400000, no upper bound
#                   (a sorts first alphabetically, but has highest keys)
#
# Globals read: DATA_DIR, CARGO_COMMAND
# Calls:        data_tpch (to make sure the source lineitem table exists)
# Returns:      0 when all three files already exist or were generated,
#               non-zero when datafusion-cli is missing, fails, or does not
#               produce every expected file.
data_sort_pushdown() {
  # Locals keep this function from clobbering the SORT_PUSHDOWN_DIR that the
  # run_sort_pushdown* functions set to the parent directory.
  local out_dir="${DATA_DIR}/sort_pushdown/lineitem"
  local tpch_dir="${DATA_DIR}/tpch_sf1"
  local names=(c_high b_mid a_low)
  local cli name missing

  # Only skip generation when *every* expected file is present; a partial
  # result from an interrupted run must not be mistaken for a complete one.
  missing=0
  for name in "${names[@]}"; do
    [ -e "${out_dir}/${name}.parquet" ] || missing=1
  done
  if [ "${missing}" -eq 0 ]; then
    echo "Sort pushdown data already exists at ${out_dir}"
    return 0
  fi

  echo "Generating sort pushdown benchmark data..."

  # First ensure we have TPC-H data to work with
  data_tpch "1" "parquet"

  mkdir -p "${out_dir}"

  # Drop leftovers from an earlier partial run so COPY starts from a clean slate.
  for name in "${names[@]}"; do
    rm -rf -- "${out_dir:?}/${name}.parquet"
  done

  # Build datafusion-cli if needed.
  # CARGO_COMMAND is intentionally unquoted so it word-splits into a command
  # plus its arguments.
  # NOTE(review): CARGO_COMMAND is defined outside this function; if it is a
  # `cargo run ...` style command this also launches the CLI - confirm.
  $CARGO_COMMAND --bin datafusion-cli

  cli="./target/release/datafusion-cli"
  if [ ! -f "${cli}" ]; then
    cli="./target/debug/datafusion-cli"
  fi
  if [ ! -x "${cli}" ]; then
    echo "Error: datafusion-cli not found in ./target/release or ./target/debug" >&2
    return 1
  fi

  echo "Creating sorted parquet files with reversed naming..."

  if ! "${cli}" --command "
    CREATE EXTERNAL TABLE lineitem
    STORED AS PARQUET
    LOCATION '${tpch_dir}/lineitem/';

    COPY (
      SELECT * FROM lineitem
      WHERE l_orderkey <= 200000
      ORDER BY l_orderkey ASC
    ) TO '${out_dir}/c_high.parquet';

    COPY (
      SELECT * FROM lineitem
      WHERE l_orderkey > 200000 AND l_orderkey <= 400000
      ORDER BY l_orderkey ASC
    ) TO '${out_dir}/b_mid.parquet';

    COPY (
      SELECT * FROM lineitem
      WHERE l_orderkey > 400000
      ORDER BY l_orderkey ASC
    ) TO '${out_dir}/a_low.parquet';
  "; then
    echo "Error: datafusion-cli failed while generating sort pushdown data" >&2
    # Remove whatever was written so the next invocation regenerates everything.
    for name in "${names[@]}"; do
      rm -rf -- "${out_dir:?}/${name}.parquet"
    done
    return 1
  fi

  # A zero exit status alone does not prove that every COPY produced output.
  for name in "${names[@]}"; do
    if [ ! -e "${out_dir}/${name}.parquet" ]; then
      echo "Error: expected output ${out_dir}/${name}.parquet was not created" >&2
      return 1
    fi
  done

  echo "Sort pushdown data generated at ${out_dir}"
  ls -la "${out_dir}"
}
1147+
1148+
# Runs the sort pushdown benchmark (without WITH ORDER): invokes
# `dfbench sort-pushdown` for 5 iterations and writes the results to
# ${RESULTS_DIR}/sort_pushdown.json.
run_sort_pushdown() {
# Parent of the lineitem/ directory that data_sort_pushdown populates.
1149+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
10901150
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
10911151
echo "Running sort pushdown benchmark (no WITH ORDER)..."
# Removed by this commit: the invocation that read the plain TPC-H data via TPCH_DIR.
1092-
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
# Added by this commit: reads the generated data and passes --queries-path
# pointing at ${SCRIPT_DIR}/queries/sort_pushdown.
# QUERY_ARG and LATENCY_ARG are unquoted - presumably so they split into
# separate arguments (or vanish when empty); confirm where they are set.
1152+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
10931153
}
10941154

10951155
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
10961156
run_sort_pushdown_sorted() {
# Removed by this commit: the benchmark no longer reads the plain TPC-H directory.
1097-
TPCH_DIR="${DATA_DIR}/tpch_sf1"
# Added by this commit: parent of the lineitem/ directory that
# data_sort_pushdown populates.
1157+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
10981158
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
10991159
echo "Running sort pushdown benchmark (with WITH ORDER)..."
# Removed by this commit: invocation against TPCH_DIR.
1100-
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
# Added by this commit: same --sorted run, but over the generated data and with
# --queries-path pointing at ${SCRIPT_DIR}/queries/sort_pushdown.
# QUERY_ARG and LATENCY_ARG are unquoted - presumably so they split into
# separate arguments (or vanish when empty); confirm where they are set.
1160+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
11011161
}
11021162

11031163
# Runs the sort integration benchmark

0 commit comments

Comments
 (0)