Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 43 additions & 6 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,7 @@ main() {
data_tpch "1" "parquet"
;;
sort_pushdown|sort_pushdown_sorted)
# same data as for tpch
data_tpch "1" "parquet"
data_sort_pushdown
;;
sort_tpch)
# same data as for tpch
Expand Down Expand Up @@ -1085,19 +1084,57 @@ run_external_aggr() {
}

# Runs the sort pushdown benchmark (without WITH ORDER)
# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
# renamed so alphabetical order does NOT match sort key order.
# This forces the sort pushdown optimizer to reorder files by statistics.
#
# tpchgen produces 3 sorted, non-overlapping parquet files:
# lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys)
# lineitem.2.parquet: l_orderkey 2M ~ 4M
# lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys)
#
# We rename them so alphabetical order is reversed:
# a_part3.parquet (highest keys, sorts first alphabetically)
# b_part2.parquet
# c_part1.parquet (lowest keys, sorts last alphabetically)
data_sort_pushdown() {
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown/lineitem"
if [ -d "${SORT_PUSHDOWN_DIR}" ] && [ "$(ls -A ${SORT_PUSHDOWN_DIR}/*.parquet 2>/dev/null)" ]; then
echo "Sort pushdown data already exists at ${SORT_PUSHDOWN_DIR}"
return
fi

echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."

TEMP_DIR="${DATA_DIR}/sort_pushdown_temp"
mkdir -p "${TEMP_DIR}" "${SORT_PUSHDOWN_DIR}"

tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${TEMP_DIR}"

# Rename: reverse alphabetical order vs key order
mv "${TEMP_DIR}/lineitem/lineitem.3.parquet" "${SORT_PUSHDOWN_DIR}/a_part3.parquet"
mv "${TEMP_DIR}/lineitem/lineitem.2.parquet" "${SORT_PUSHDOWN_DIR}/b_part2.parquet"
mv "${TEMP_DIR}/lineitem/lineitem.1.parquet" "${SORT_PUSHDOWN_DIR}/c_part1.parquet"

rm -rf "${TEMP_DIR}"

echo "Sort pushdown data generated at ${SORT_PUSHDOWN_DIR}"
ls -la "${SORT_PUSHDOWN_DIR}"
}

run_sort_pushdown() {
TPCH_DIR="${DATA_DIR}/tpch_sf1"
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
echo "Running sort pushdown benchmark (no WITH ORDER)..."
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}

# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
run_sort_pushdown_sorted() {
TPCH_DIR="${DATA_DIR}/tpch_sf1"
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
echo "Running sort pushdown benchmark (with WITH ORDER)..."
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}

# Runs the sort integration benchmark
Expand Down
Loading