Skip to content

Commit 40e6880

Browse files
authored
Merge branch 'main' into propagate-empty-outer-join-null-pad
2 parents: 56c0c62 + 18af518; commit 40e6880

44 files changed

Lines changed: 2217 additions & 269 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.asf.yaml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,12 @@ github:
5151
main:
5252
required_pull_request_reviews:
5353
required_approving_review_count: 1
54+
required_status_checks:
55+
contexts:
56+
- "Check License Header"
57+
- "Use prettier to check formatting of documents"
58+
- "Validate required_status_checks in .asf.yaml"
59+
- "Spell Check with Typos"
5460
# needs to be updated as part of the release process
5561
# .asf.yaml doesn't support wildcard branch protection rules, only exact branch names
5662
# https://github.com/apache/infrastructure-asfyaml?tab=readme-ov-file#branch-protection

.github/workflows/dev.yml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,14 @@ jobs:
5151
# if you encounter error, see instructions inside the script
5252
run: ci/scripts/doc_prettier_check.sh
5353

54+
asf-yaml-check:
55+
name: Validate required_status_checks in .asf.yaml
56+
runs-on: ubuntu-latest
57+
steps:
58+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
59+
- run: pip install pyyaml
60+
- run: python3 ci/scripts/check_asf_yaml_status_checks.py
61+
5462
typos:
5563
name: Spell Check with Typos
5664
runs-on: ubuntu-latest

.github/workflows/extended.yml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ jobs:
6464
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=8,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
6565
# note: do not use amd/rust container to preserve disk space
6666
steps:
67-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
67+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
6868
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
6969
with:
7070
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
@@ -91,7 +91,7 @@ jobs:
9191
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
9292
# note: do not use amd/rust container to preserve disk space
9393
steps:
94-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
94+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
9595
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
9696
with:
9797
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
@@ -138,7 +138,7 @@ jobs:
138138
container:
139139
image: amd64/rust
140140
steps:
141-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
141+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
142142
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
143143
with:
144144
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
@@ -160,7 +160,7 @@ jobs:
160160
container:
161161
image: amd64/rust
162162
steps:
163-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
163+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
164164
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
165165
with:
166166
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push

.github/workflows/rust.yml

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ jobs:
5151
container:
5252
image: amd64/rust
5353
steps:
54-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
54+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
5555
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
5656
- name: Setup Rust toolchain
5757
uses: ./.github/actions/setup-builder
@@ -142,7 +142,7 @@ jobs:
142142
container:
143143
image: amd64/rust
144144
steps:
145-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
145+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
146146
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
147147
- name: Setup Rust toolchain
148148
uses: ./.github/actions/setup-builder
@@ -174,7 +174,7 @@ jobs:
174174
container:
175175
image: amd64/rust
176176
steps:
177-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
177+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
178178
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
179179
- name: Setup Rust toolchain
180180
uses: ./.github/actions/setup-builder
@@ -277,7 +277,7 @@ jobs:
277277
volumes:
278278
- /usr/local:/host/usr/local
279279
steps:
280-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
280+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
281281
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
282282
with:
283283
submodules: true
@@ -305,7 +305,7 @@ jobs:
305305
--lib \
306306
--tests \
307307
--bins \
308-
--features serde,avro,json,backtrace,integration-tests,parquet_encryption
308+
--features serde,avro,json,backtrace,integration-tests,parquet_encryption,substrait
309309
- name: Verify Working Directory Clean
310310
run: git diff --exit-code
311311
# Check no temporary directories created during test.
@@ -324,7 +324,7 @@ jobs:
324324
needs: linux-build-lib
325325
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
326326
steps:
327-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
327+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
328328
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
329329
with:
330330
submodules: true
@@ -356,7 +356,7 @@ jobs:
356356
container:
357357
image: amd64/rust
358358
steps:
359-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
359+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
360360
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
361361
with:
362362
submodules: true
@@ -387,7 +387,7 @@ jobs:
387387
container:
388388
image: amd64/rust
389389
steps:
390-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
390+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
391391
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
392392
with:
393393
submodules: true
@@ -409,7 +409,7 @@ jobs:
409409
container:
410410
image: amd64/rust
411411
steps:
412-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
412+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
413413
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
414414
- name: Setup Rust toolchain
415415
uses: ./.github/actions/setup-builder
@@ -450,7 +450,7 @@ jobs:
450450
container:
451451
image: amd64/rust
452452
steps:
453-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
453+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
454454
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
455455
with:
456456
submodules: true
@@ -473,7 +473,7 @@ jobs:
473473
export RUST_MIN_STACK=20971520
474474
export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data`
475475
cargo test plan_q --package datafusion-benchmarks --profile ci --features=ci -- --test-threads=1
476-
INCLUDE_TPCH=true cargo test --features backtrace,parquet_encryption --profile ci --package datafusion-sqllogictest --test sqllogictests
476+
INCLUDE_TPCH=true cargo test --features backtrace,parquet_encryption,substrait --profile ci --package datafusion-sqllogictest --test sqllogictests
477477
- name: Verify Working Directory Clean
478478
run: git diff --exit-code
479479

@@ -498,7 +498,7 @@ jobs:
498498
--health-timeout 5s
499499
--health-retries 5
500500
steps:
501-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
501+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
502502
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
503503
with:
504504
submodules: true
@@ -523,7 +523,7 @@ jobs:
523523
container:
524524
image: amd64/rust
525525
steps:
526-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
526+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
527527
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
528528
with:
529529
submodules: true
@@ -537,7 +537,7 @@ jobs:
537537
# command cannot be run for all the .slt files. Run it for just one that works (limit.slt)
538538
# until most of the tickets in https://github.com/apache/datafusion/issues/16248 are addressed
539539
# and this command can be run without filters.
540-
run: cargo test --test sqllogictests -- --substrait-round-trip limit.slt
540+
run: cargo test -p datafusion-sqllogictest --test sqllogictests --features substrait -- --substrait-round-trip limit.slt
541541

542542
# Temporarily commenting out the Windows flow, the reason is enormously slow running build
543543
# Waiting for new Windows 2025 github runner
@@ -570,7 +570,7 @@ jobs:
570570
uses: ./.github/actions/setup-macos-aarch64-builder
571571
- name: Run tests (excluding doctests)
572572
shell: bash
573-
run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests
573+
run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests,substrait
574574

575575
vendor:
576576
name: Verify Vendored Code
@@ -654,7 +654,7 @@ jobs:
654654
container:
655655
image: amd64/rust
656656
steps:
657-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
657+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
658658
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
659659
with:
660660
submodules: true
@@ -701,7 +701,7 @@ jobs:
701701
container:
702702
image: amd64/rust
703703
steps:
704-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
704+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
705705
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
706706
with:
707707
submodules: true

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,3 +78,6 @@ datafusion-examples/examples/datafusion-examples/
7878

7979
# Samply profile data
8080
profile.json.gz
81+
82+
# Claude Code personal settings
83+
.claude/settings.local.json

Cargo.lock

Lines changed: 8 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,7 @@ liblzma = { version = "0.4.6", features = ["static"] }
170170
log = "^0.4"
171171
memchr = "2.8.0"
172172
num-traits = { version = "0.2" }
173-
object_store = { version = "0.13.1", default-features = false }
173+
object_store = { version = "0.13.2", default-features = false }
174174
parking_lot = "0.12"
175175
parquet = { version = "58.1.0", default-features = false, features = [
176176
"arrow",

benchmarks/bench.sh

Lines changed: 43 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -314,8 +314,7 @@ main() {
314314
data_tpch "1" "parquet"
315315
;;
316316
sort_pushdown|sort_pushdown_sorted)
317-
# same data as for tpch
318-
data_tpch "1" "parquet"
317+
data_sort_pushdown
319318
;;
320319
sort_tpch)
321320
# same data as for tpch
@@ -1085,19 +1084,57 @@ run_external_aggr() {
10851084
}
10861085

10871086
# Runs the sort pushdown benchmark (without WITH ORDER)
1087+
# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
1088+
# renamed so alphabetical order does NOT match sort key order.
1089+
# This forces the sort pushdown optimizer to reorder files by statistics.
1090+
#
1091+
# tpchgen produces 3 sorted, non-overlapping parquet files:
1092+
# lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys)
1093+
# lineitem.2.parquet: l_orderkey 2M ~ 4M
1094+
# lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys)
1095+
#
1096+
# We rename them so alphabetical order is reversed:
1097+
# a_part3.parquet (highest keys, sorts first alphabetically)
1098+
# b_part2.parquet
1099+
# c_part1.parquet (lowest keys, sorts last alphabetically)
1100+
data_sort_pushdown() {
1101+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown/lineitem"
1102+
if [ -d "${SORT_PUSHDOWN_DIR}" ] && [ "$(ls -A ${SORT_PUSHDOWN_DIR}/*.parquet 2>/dev/null)" ]; then
1103+
echo "Sort pushdown data already exists at ${SORT_PUSHDOWN_DIR}"
1104+
return
1105+
fi
1106+
1107+
echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."
1108+
1109+
TEMP_DIR="${DATA_DIR}/sort_pushdown_temp"
1110+
mkdir -p "${TEMP_DIR}" "${SORT_PUSHDOWN_DIR}"
1111+
1112+
tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${TEMP_DIR}"
1113+
1114+
# Rename: reverse alphabetical order vs key order
1115+
mv "${TEMP_DIR}/lineitem/lineitem.3.parquet" "${SORT_PUSHDOWN_DIR}/a_part3.parquet"
1116+
mv "${TEMP_DIR}/lineitem/lineitem.2.parquet" "${SORT_PUSHDOWN_DIR}/b_part2.parquet"
1117+
mv "${TEMP_DIR}/lineitem/lineitem.1.parquet" "${SORT_PUSHDOWN_DIR}/c_part1.parquet"
1118+
1119+
rm -rf "${TEMP_DIR}"
1120+
1121+
echo "Sort pushdown data generated at ${SORT_PUSHDOWN_DIR}"
1122+
ls -la "${SORT_PUSHDOWN_DIR}"
1123+
}
1124+
10881125
run_sort_pushdown() {
1089-
TPCH_DIR="${DATA_DIR}/tpch_sf1"
1126+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
10901127
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
10911128
echo "Running sort pushdown benchmark (no WITH ORDER)..."
1092-
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
1129+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
10931130
}
10941131

10951132
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
10961133
run_sort_pushdown_sorted() {
1097-
TPCH_DIR="${DATA_DIR}/tpch_sf1"
1134+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
10981135
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
10991136
echo "Running sort pushdown benchmark (with WITH ORDER)..."
1100-
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
1137+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
11011138
}
11021139

11031140
# Runs the sort integration benchmark

0 commit comments

Comments
 (0)