Skip to content

Commit 728227b

Browse files
authored
Merge branch 'main' into fix/3429-spark4-ctas-union-native-writer
2 parents fdc2514 + 48f7b03 commit 728227b

77 files changed

Lines changed: 1651 additions & 1392 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.claude/skills/bug-triage/SKILL.md

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,8 +65,7 @@ For each issue, review the title and body and determine:
6565
2. **Area labels** (zero or more): from the area table in the guide
6666
(`area:writer`, `area:shuffle`, `area:aggregation`, `area:scan`,
6767
`area:expressions`, `area:ffi`, `area:ci`) plus the pre-existing area
68-
indicators (`native_datafusion`, `native_iceberg_compat`, `spark 4`,
69-
`spark sql tests`).
68+
indicators (`spark 4`, `spark sql tests`).
7069
3. **Escalation note**: if the issue matches an escalation trigger from the
7170
guide (e.g., a `priority:high` crash that may also produce wrong results),
7271
note it in the summary.

.github/workflows/codeql.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,11 +49,11 @@ jobs:
4949
persist-credentials: false
5050

5151
- name: Initialize CodeQL
52-
uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
52+
uses: github/codeql-action/init@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4
5353
with:
5454
languages: actions
5555

5656
- name: Perform CodeQL Analysis
57-
uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4
57+
uses: github/codeql-action/analyze@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4
5858
with:
5959
category: "/language:actions"

.github/workflows/iceberg_spark_test.yml

Lines changed: 34 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -25,27 +25,41 @@ on:
2525
push:
2626
branches:
2727
- main
28-
paths-ignore:
29-
- "benchmarks/**"
30-
- "doc/**"
31-
- "docs/**"
32-
- "**.md"
33-
- "native/core/benches/**"
34-
- "native/spark-expr/benches/**"
35-
- "spark/src/test/**"
36-
- "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
37-
- "spark-integration/**"
28+
paths:
29+
- "native/**/src/**"
30+
- "native/**/Cargo.toml"
31+
- "native/Cargo.lock"
32+
- "!native/hdfs/**"
33+
- "!native/fs-hdfs/**"
34+
- "common/src/main/**"
35+
- "common/pom.xml"
36+
- "spark/src/main/**"
37+
- "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
38+
- "spark/pom.xml"
39+
- "dev/diffs/iceberg/**"
40+
- "pom.xml"
41+
- "rust-toolchain.toml"
42+
- ".github/workflows/iceberg_spark_test.yml"
43+
- ".github/actions/setup-builder/**"
44+
- ".github/actions/setup-iceberg-builder/**"
3845
pull_request:
39-
paths-ignore:
40-
- "benchmarks/**"
41-
- "doc/**"
42-
- "docs/**"
43-
- "**.md"
44-
- "native/core/benches/**"
45-
- "native/spark-expr/benches/**"
46-
- "spark/src/test/**"
47-
- "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
48-
- "spark-integration/**"
46+
paths:
47+
- "native/**/src/**"
48+
- "native/**/Cargo.toml"
49+
- "native/Cargo.lock"
50+
- "!native/hdfs/**"
51+
- "!native/fs-hdfs/**"
52+
- "common/src/main/**"
53+
- "common/pom.xml"
54+
- "spark/src/main/**"
55+
- "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
56+
- "spark/pom.xml"
57+
- "dev/diffs/iceberg/**"
58+
- "pom.xml"
59+
- "rust-toolchain.toml"
60+
- ".github/workflows/iceberg_spark_test.yml"
61+
- ".github/actions/setup-builder/**"
62+
- ".github/actions/setup-iceberg-builder/**"
4963
# manual trigger
5064
# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
5165
workflow_dispatch:

.github/workflows/miri.yml

Lines changed: 3 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -22,26 +22,9 @@ concurrency:
2222
cancel-in-progress: true
2323

2424
on:
25-
push:
26-
branches:
27-
- main
28-
paths-ignore:
29-
- "doc/**"
30-
- "docs/**"
31-
- "**.md"
32-
- "native/core/benches/**"
33-
- "native/spark-expr/benches/**"
34-
- "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
35-
- "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
36-
pull_request:
37-
paths-ignore:
38-
- "doc/**"
39-
- "docs/**"
40-
- "**.md"
41-
- "native/core/benches/**"
42-
- "native/spark-expr/benches/**"
43-
- "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
44-
- "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
25+
# nightly safety check
26+
schedule:
27+
- cron: '0 4 * * *'
4528
# manual trigger
4629
# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
4730
workflow_dispatch:

.github/workflows/pr_build_linux.yml

Lines changed: 72 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -22,28 +22,55 @@ concurrency:
2222
cancel-in-progress: true
2323

2424
on:
25+
# Allow-list of paths that affect this workflow. A change must match a positive
26+
# pattern (and not a trailing "!" exclusion) for the build to run. Editing
27+
# pr_build_macos.yml does not trigger this workflow, and vice versa.
2528
push:
2629
branches:
2730
- main
28-
paths-ignore:
29-
- "benchmarks/**"
30-
- "doc/**"
31-
- "docs/**"
32-
- "**.md"
33-
- "native/core/benches/**"
34-
- "native/spark-expr/benches/**"
35-
- "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
36-
- "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
31+
paths:
32+
- "native/**"
33+
- "common/**"
34+
- "spark/**"
35+
- "spark-integration/**"
36+
- "pom.xml"
37+
- "**/pom.xml"
38+
- ".mvn/**"
39+
- "mvnw"
40+
- "Makefile"
41+
- "rust-toolchain.toml"
42+
- "dev/ci/**"
43+
- ".github/workflows/pr_build_linux.yml"
44+
- ".github/actions/setup-builder/**"
45+
- ".github/actions/java-test/**"
46+
- ".github/actions/rust-test/**"
47+
- "!**.md"
48+
- "!native/core/benches/**"
49+
- "!native/spark-expr/benches/**"
50+
- "!spark/src/test/scala/org/apache/spark/sql/benchmark/**"
51+
- "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
3752
pull_request:
38-
paths-ignore:
39-
- "benchmarks/**"
40-
- "doc/**"
41-
- "docs/**"
42-
- "**.md"
43-
- "native/core/benches/**"
44-
- "native/spark-expr/benches/**"
45-
- "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
46-
- "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
53+
paths:
54+
- "native/**"
55+
- "common/**"
56+
- "spark/**"
57+
- "spark-integration/**"
58+
- "pom.xml"
59+
- "**/pom.xml"
60+
- ".mvn/**"
61+
- "mvnw"
62+
- "Makefile"
63+
- "rust-toolchain.toml"
64+
- "dev/ci/**"
65+
- ".github/workflows/pr_build_linux.yml"
66+
- ".github/actions/setup-builder/**"
67+
- ".github/actions/java-test/**"
68+
- ".github/actions/rust-test/**"
69+
- "!**.md"
70+
- "!native/core/benches/**"
71+
- "!native/spark-expr/benches/**"
72+
- "!spark/src/test/scala/org/apache/spark/sql/benchmark/**"
73+
- "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala"
4774
# manual trigger
4875
# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
4976
workflow_dispatch:
@@ -295,14 +322,28 @@ jobs:
295322
- name: "Spark 4.2, JDK 17"
296323
java_version: "17"
297324
maven_opts: "-Pspark-4.2"
325+
# Suites are grouped by functional area into balanced buckets so that no test
326+
# job runs much longer than ~23 min. See
327+
# docs/superpowers/specs/2026-05-22-pr-build-consolidation-design.md for the
328+
# per-suite timing analysis behind this grouping.
298329
suite:
299-
- name: "fuzz"
330+
- name: "scans"
300331
value: |
332+
org.apache.comet.parquet.CometParquetWriterSuite
333+
org.apache.comet.parquet.ParquetReadV1Suite
334+
org.apache.comet.parquet.ParquetReadV2Suite
335+
org.apache.comet.parquet.ParquetReadFromFakeHadoopFsSuite
336+
org.apache.comet.parquet.ParquetTimestampLtzAsNtzSuite
337+
org.apache.spark.sql.comet.ParquetDatetimeRebaseV1Suite
338+
org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite
339+
org.apache.spark.sql.comet.ParquetEncryptionITCase
340+
org.apache.comet.exec.CometNativeReaderSuite
341+
org.apache.comet.CometIcebergNativeSuite
342+
org.apache.comet.CometIcebergRewriteActionSuite
343+
org.apache.comet.iceberg.IcebergReflectionSuite
344+
org.apache.comet.csv.CometCsvNativeReadSuite
301345
org.apache.comet.CometFuzzTestSuite
302-
org.apache.comet.CometFuzzAggregateSuite
303346
org.apache.comet.CometFuzzIcebergSuite
304-
org.apache.comet.CometFuzzMathSuite
305-
org.apache.comet.CometCodegenFuzzSuite
306347
org.apache.comet.DataGeneratorSuite
307348
- name: "shuffle"
308349
value: |
@@ -316,23 +357,6 @@ jobs:
316357
org.apache.comet.exec.DisableAQECometShuffleSuite
317358
org.apache.comet.exec.DisableAQECometAsyncShuffleSuite
318359
org.apache.spark.shuffle.sort.SpillSorterSuite
319-
- name: "parquet"
320-
value: |
321-
org.apache.comet.parquet.CometParquetWriterSuite
322-
org.apache.comet.parquet.ParquetReadV1Suite
323-
org.apache.comet.parquet.ParquetReadV2Suite
324-
org.apache.comet.parquet.ParquetReadFromFakeHadoopFsSuite
325-
org.apache.comet.parquet.ParquetTimestampLtzAsNtzSuite
326-
org.apache.spark.sql.comet.ParquetDatetimeRebaseV1Suite
327-
org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite
328-
org.apache.spark.sql.comet.ParquetEncryptionITCase
329-
org.apache.comet.exec.CometNativeReaderSuite
330-
org.apache.comet.CometIcebergNativeSuite
331-
org.apache.comet.CometIcebergRewriteActionSuite
332-
org.apache.comet.iceberg.IcebergReflectionSuite
333-
- name: "csv"
334-
value: |
335-
org.apache.comet.csv.CometCsvNativeReadSuite
336360
- name: "exec"
337361
value: |
338362
org.apache.comet.exec.CometAggregateSuite
@@ -360,6 +384,9 @@ jobs:
360384
org.apache.spark.sql.comet.CometShuffleFallbackStickinessSuite
361385
org.apache.spark.sql.comet.CometDecimalArithmeticViewSuite
362386
org.apache.comet.objectstore.NativeConfigSuite
387+
org.apache.spark.sql.CometToPrettyStringSuite
388+
org.apache.spark.sql.CometCollationSuite
389+
org.apache.comet.CometFuzzAggregateSuite
363390
- name: "expressions"
364391
value: |
365392
org.apache.comet.CometExpressionSuite
@@ -376,18 +403,15 @@ jobs:
376403
org.apache.comet.CometMapExpressionSuite
377404
org.apache.comet.CometCsvExpressionSuite
378405
org.apache.comet.CometJsonExpressionSuite
379-
org.apache.comet.CometDateTimeUtilsSuite
380406
org.apache.comet.SparkErrorConverterSuite
381407
org.apache.comet.expressions.conditional.CometIfSuite
382408
org.apache.comet.expressions.conditional.CometCoalesceSuite
383409
org.apache.comet.expressions.conditional.CometCaseWhenSuite
384410
org.apache.comet.CometCodegenSuite
385411
org.apache.comet.CometCodegenSourceSuite
386412
org.apache.comet.CometCodegenHOFSuite
387-
- name: "sql"
388-
value: |
389-
org.apache.spark.sql.CometToPrettyStringSuite
390-
org.apache.spark.sql.CometCollationSuite
413+
org.apache.comet.CometFuzzMathSuite
414+
org.apache.comet.CometCodegenFuzzSuite
391415
fail-fast: false
392416
name: ${{ matrix.profile.name }} [${{ matrix.suite.name }}]
393417
runs-on: ubuntu-24.04
@@ -427,7 +451,7 @@ jobs:
427451
uses: ./.github/actions/java-test
428452
with:
429453
artifact_name: ${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
430-
suites: ${{ matrix.suite.name == 'sql' && matrix.profile.name == 'Spark 3.4, JDK 11, Scala 2.12' && '' || matrix.suite.value }}
454+
suites: ${{ matrix.suite.value }}
431455
maven_opts: ${{ matrix.profile.maven_opts }}
432456
upload-test-reports: true
433457
skip-native-build: true
@@ -486,19 +510,16 @@ jobs:
486510
run: |
487511
SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test
488512
489-
# TPC-DS correctness tests - verifies benchmark queries produce correct results
513+
# TPC-DS correctness tests - verifies benchmark queries produce correct results.
514+
# The three join strategies run sequentially in one job so the project is built once.
490515
verify-benchmark-results-tpcds:
491516
needs: build-native
492-
name: Verify TPC-DS Results (${{ matrix.join }})
517+
name: Verify TPC-DS Results
493518
runs-on: ubuntu-24.04
494519
container:
495520
image: amd64/rust
496521
env:
497522
JAVA_TOOL_OPTIONS: --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
498-
strategy:
499-
matrix:
500-
join: [sort_merge, broadcast, hash]
501-
fail-fast: false
502523
steps:
503524
- uses: actions/checkout@v6
504525

@@ -555,7 +576,6 @@ jobs:
555576
cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw -B -Prelease exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1"
556577
557578
- name: Run TPC-DS queries (Sort merge join)
558-
if: matrix.join == 'sort_merge'
559579
run: |
560580
SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
561581
env:
@@ -564,15 +584,13 @@ jobs:
564584
spark.sql.join.preferSortMergeJoin=true
565585
566586
- name: Run TPC-DS queries (Broadcast hash join)
567-
if: matrix.join == 'broadcast'
568587
run: |
569588
SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
570589
env:
571590
SPARK_TPCDS_JOIN_CONF: |
572591
spark.sql.autoBroadcastJoinThreshold=10485760
573592
574593
- name: Run TPC-DS queries (Shuffled hash join)
575-
if: matrix.join == 'hash'
576594
run: |
577595
SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
578596
env:

0 commit comments

Comments
 (0)