@@ -109,6 +109,9 @@ clickbench_extended: ClickBench \"inspired\" queries against a single parquet
109109# Sort Pushdown Benchmarks
110110sort_pushdown: Sort pushdown baseline (no WITH ORDER) on TPC-H data (SF=1)
111111sort_pushdown_sorted: Sort pushdown with WITH ORDER — tests sort elimination on non-overlapping files
112+ sort_pushdown_inexact: Sort pushdown Inexact path (--sorted DESC) — tests reverse scan + RG reorder
113+ sort_pushdown_inexact_unsorted: Sort pushdown Inexact path (no WITH ORDER) — tests Unsupported path + RG reorder
114+ sort_pushdown_inexact_overlap: Sort pushdown Inexact path — partially overlapping RGs (streaming data scenario)
112115
113116# Sorted Data Benchmarks (ORDER BY Optimization)
114117clickbench_sorted: ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)
@@ -316,6 +319,9 @@ main() {
316319 sort_pushdown|sort_pushdown_sorted)
317320 data_sort_pushdown
318321 ;;
322+ sort_pushdown_inexact|sort_pushdown_inexact_unsorted|sort_pushdown_inexact_overlap)
323+ data_sort_pushdown_inexact
324+ ;;
319325 sort_tpch)
320326 # same data as for tpch
321327 data_tpch " 1" " parquet"
@@ -522,6 +528,15 @@ main() {
522528 sort_pushdown_sorted)
523529 run_sort_pushdown_sorted
524530 ;;
531+ sort_pushdown_inexact)
532+ run_sort_pushdown_inexact
533+ ;;
534+ sort_pushdown_inexact_unsorted)
535+ run_sort_pushdown_inexact_unsorted
536+ ;;
537+ sort_pushdown_inexact_overlap)
538+ run_sort_pushdown_inexact_overlap
539+ ;;
525540 sort_tpch)
526541 run_sort_tpch " 1"
527542 ;;
@@ -1137,6 +1152,121 @@ run_sort_pushdown_sorted() {
11371152 debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path " ${SORT_PUSHDOWN_DIR} " --queries-path " ${SCRIPT_DIR} /queries/sort_pushdown" -o " ${RESULTS_FILE} " ${QUERY_ARG} ${LATENCY_ARG}
11381153}
11391154
1155+ # Generates data for sort pushdown Inexact benchmark.
1156+ #
1157+ # Produces a single large lineitem parquet file where row groups have
1158+ # NON-OVERLAPPING but OUT-OF-ORDER l_orderkey ranges (each RG internally
1159+ # sorted, RGs shuffled). This simulates append-heavy workloads where data
1160+ # is written in batches at different times.
1161+ data_sort_pushdown_inexact () {
1162+ INEXACT_DIR=" ${DATA_DIR} /sort_pushdown_inexact/lineitem"
1163+ if [ -d " ${INEXACT_DIR} " ] && [ " $( ls -A ${INEXACT_DIR} /* .parquet 2> /dev/null) " ]; then
1164+ echo " Sort pushdown Inexact data already exists at ${INEXACT_DIR} "
1165+ return
1166+ fi
1167+
1168+ echo " Generating sort pushdown Inexact benchmark data (single file, shuffled RGs)..."
1169+
1170+ # Re-use the sort_pushdown data as the source (generate if missing)
1171+ data_sort_pushdown
1172+
1173+ mkdir -p " ${INEXACT_DIR} "
1174+ SRC_DIR=" ${DATA_DIR} /sort_pushdown/lineitem"
1175+
1176+ # Use datafusion-cli to bucket rows into 64 groups by a deterministic
1177+ # scrambler, then sort within each bucket by orderkey. This produces
1178+ # ~64 RG-sized segments where each has a tight orderkey range but the
1179+ # segments appear in scrambled (non-sorted) order in the file.
1180+ (cd " ${SCRIPT_DIR} /.." && cargo run --release -p datafusion-cli -- -c "
1181+ CREATE EXTERNAL TABLE src
1182+ STORED AS PARQUET
1183+ LOCATION '${SRC_DIR} ';
1184+
1185+ COPY (
1186+ SELECT * FROM src
1187+ ORDER BY
1188+ (l_orderkey * 1664525 + 1013904223) % 64,
1189+ l_orderkey
1190+ )
1191+ TO '${INEXACT_DIR} /shuffled.parquet'
1192+ STORED AS PARQUET
1193+ OPTIONS ('format.max_row_group_size' '100000');
1194+ " )
1195+
1196+ echo " Sort pushdown Inexact shuffled data generated at ${INEXACT_DIR} "
1197+ ls -la " ${INEXACT_DIR} "
1198+
1199+ # Also generate a file with partially overlapping row groups.
1200+ # Simulates streaming data with network delays: each chunk is mostly
1201+ # in order but has a small overlap with the next chunk (±5% of the
1202+ # chunk range). This is the pattern described by @adriangb — data
1203+ # arriving with timestamps that are generally increasing but with
1204+ # network-induced jitter causing small overlaps between row groups.
1205+ OVERLAP_DIR=" ${DATA_DIR} /sort_pushdown_inexact_overlap/lineitem"
1206+ if [ -d " ${OVERLAP_DIR} " ] && [ " $( ls -A ${OVERLAP_DIR} /* .parquet 2> /dev/null) " ]; then
1207+ echo " Sort pushdown Inexact overlap data already exists at ${OVERLAP_DIR} "
1208+ return
1209+ fi
1210+
1211+ echo " Generating sort pushdown Inexact overlap data (partially overlapping RGs)..."
1212+ mkdir -p " ${OVERLAP_DIR} "
1213+
1214+ (cd " ${SCRIPT_DIR} /.." && cargo run --release -p datafusion-cli -- -c "
1215+ CREATE EXTERNAL TABLE src
1216+ STORED AS PARQUET
1217+ LOCATION '${SRC_DIR} ';
1218+
1219+ -- Add jitter to l_orderkey: shift each row by a random-ish offset
1220+ -- proportional to its position. This creates overlap between adjacent
1221+ -- row groups while preserving the general ascending trend.
1222+ -- Formula: l_orderkey + (l_orderkey * 7 % 5000) - 2500
1223+ -- This adds ±2500 jitter, creating ~5K overlap between adjacent 100K-row RGs.
1224+ COPY (
1225+ SELECT * FROM src
1226+ ORDER BY l_orderkey + (l_orderkey * 7 % 5000) - 2500
1227+ )
1228+ TO '${OVERLAP_DIR} /overlapping.parquet'
1229+ STORED AS PARQUET
1230+ OPTIONS ('format.max_row_group_size' '100000');
1231+ " )
1232+
1233+ echo " Sort pushdown Inexact overlap data generated at ${OVERLAP_DIR} "
1234+ ls -la " ${OVERLAP_DIR} "
1235+ }
1236+
1237+ # Runs the sort pushdown Inexact benchmark (tests RG reorder by statistics).
1238+ # Enables pushdown_filters so TopK's dynamic filter is pushed to the parquet
1239+ # reader for late materialization (only needed for Inexact path).
1240+ run_sort_pushdown_inexact () {
1241+ INEXACT_DIR=" ${DATA_DIR} /sort_pushdown_inexact"
1242+ RESULTS_FILE=" ${RESULTS_DIR} /sort_pushdown_inexact.json"
1243+ echo " Running sort pushdown Inexact benchmark (--sorted, DESC, reverse scan path)..."
1244+ DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \
1245+ debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path " ${INEXACT_DIR} " --queries-path " ${SCRIPT_DIR} /queries/sort_pushdown_inexact" -o " ${RESULTS_FILE} " ${QUERY_ARG} ${LATENCY_ARG}
1246+ }
1247+
1248+ # Runs the sort pushdown Inexact benchmark WITHOUT declared ordering.
1249+ # Tests the Unsupported path in try_pushdown_sort where RG reorder by
1250+ # statistics can still help TopK queries without any file ordering guarantee.
1251+ run_sort_pushdown_inexact_unsorted () {
1252+ INEXACT_DIR=" ${DATA_DIR} /sort_pushdown_inexact"
1253+ RESULTS_FILE=" ${RESULTS_DIR} /sort_pushdown_inexact_unsorted.json"
1254+ echo " Running sort pushdown Inexact benchmark (no WITH ORDER, Unsupported path)..."
1255+ DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \
1256+ debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path " ${INEXACT_DIR} " --queries-path " ${SCRIPT_DIR} /queries/sort_pushdown_inexact_unsorted" -o " ${RESULTS_FILE} " ${QUERY_ARG} ${LATENCY_ARG}
1257+ }
1258+
1259+ # Runs the sort pushdown benchmark with partially overlapping RGs.
1260+ # Simulates streaming data with network jitter — RGs are mostly in order
1261+ # but have small overlaps (±2500 orderkey jitter between adjacent RGs).
1262+ run_sort_pushdown_inexact_overlap () {
1263+ OVERLAP_DIR=" ${DATA_DIR} /sort_pushdown_inexact_overlap"
1264+ RESULTS_FILE=" ${RESULTS_DIR} /sort_pushdown_inexact_overlap.json"
1265+ echo " Running sort pushdown Inexact benchmark (overlapping RGs, streaming data pattern)..."
1266+ DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true \
1267+ debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path " ${OVERLAP_DIR} " --queries-path " ${SCRIPT_DIR} /queries/sort_pushdown_inexact_overlap" -o " ${RESULTS_FILE} " ${QUERY_ARG} ${LATENCY_ARG}
1268+ }
1269+
11401270# Runs the sort integration benchmark
11411271run_sort_tpch () {
11421272 SCALE_FACTOR=$1
0 commit comments