Skip to content

Commit 24b9361

Browse files
committed
Merge remote-tracking branch 'origin/main' into user-jvm-udf
2 parents 22439e3 + 22b7bed commit 24b9361

317 files changed

Lines changed: 2518 additions & 26891 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.asf.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ github:
4141
features:
4242
issues: true
4343
discussions: true
44+
projects: true
4445
protected_branches:
4546
main:
4647
required_pull_request_reviews:

.github/workflows/docker-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,6 @@ jobs:
7474
with:
7575
platforms: linux/amd64,linux/arm64
7676
push: true
77-
tags: ghcr.io/apache/datafusion-comet:spark-3.5-scala-2.12-${{ env.COMET_VERSION }}
77+
tags: ghcr.io/apache/datafusion-comet:spark-4.1-scala-2.13-${{ env.COMET_VERSION }}
7878
file: kube/Dockerfile
7979
no-cache: true

.github/workflows/pr_build_linux.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ jobs:
355355
org.apache.comet.exec.CometWindowExecSuite
356356
org.apache.comet.exec.CometJoinSuite
357357
org.apache.comet.CometNativeSuite
358+
org.apache.comet.CometSetOpWithGroupBySuite
358359
org.apache.comet.CometSparkSessionExtensionsSuite
359360
org.apache.comet.CometUserUdfSuite
360361
org.apache.spark.CometPluginsSuite
@@ -451,6 +452,8 @@ jobs:
451452
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }}
452453
container:
453454
image: amd64/rust
455+
env:
456+
JAVA_TOOL_OPTIONS: --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
454457
steps:
455458
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
456459

@@ -460,7 +463,7 @@ jobs:
460463
uses: ./.github/actions/setup-builder
461464
with:
462465
rust-version: ${{ env.RUST_VERSION }}
463-
jdk-version: 11
466+
jdk-version: 17
464467

465468
- name: Download native library
466469
uses: actions/download-artifact@v8
@@ -505,6 +508,8 @@ jobs:
505508
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }}
506509
container:
507510
image: amd64/rust
511+
env:
512+
JAVA_TOOL_OPTIONS: --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
508513
strategy:
509514
matrix:
510515
join: [sort_merge, broadcast, hash]
@@ -518,7 +523,7 @@ jobs:
518523
uses: ./.github/actions/setup-builder
519524
with:
520525
rust-version: ${{ env.RUST_VERSION }}
521-
jdk-version: 11
526+
jdk-version: 17
522527

523528
- name: Download native library
524529
uses: actions/download-artifact@v8

.github/workflows/pr_build_macos.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ jobs:
194194
org.apache.comet.exec.CometWindowExecSuite
195195
org.apache.comet.exec.CometJoinSuite
196196
org.apache.comet.CometNativeSuite
197+
org.apache.comet.CometSetOpWithGroupBySuite
197198
org.apache.comet.CometSparkSessionExtensionsSuite
198199
org.apache.comet.CometUserUdfSuite
199200
org.apache.spark.CometPluginsSuite

common/src/main/scala/org/apache/comet/parquet/CometParquetUtils.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@ object CometParquetUtils {
2929
private val PARQUET_FIELD_ID_READ_ENABLED = "spark.sql.parquet.fieldId.read.enabled"
3030
private val IGNORE_MISSING_PARQUET_FIELD_ID = "spark.sql.parquet.fieldId.read.ignoreMissing"
3131

32+
// Field-metadata key arrow-rs writes when it lifts Parquet field IDs into the Arrow schema
33+
// (`parquet::arrow::PARQUET_FIELD_ID_META_KEY`). Spark's local key for the same concept is
34+
// `parquet.field.id` (`ParquetUtils.FIELD_ID_METADATA_KEY`). The serde translates at the proto
35+
// boundary so the native side can match the same key it gets from arrow-rs.
36+
val PARQUET_FIELD_ID_META_KEY = "PARQUET:field_id"
37+
3238
// Map of encryption configuration key-value pairs that, if present, are only supported with
3339
// these specific values. Generally, these are the default values that won't be present,
3440
// but if they are present we want to check them.

dev/diffs/4.1.1.diff

Lines changed: 0 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -150,50 +150,6 @@ index 4410fe50912..43bcce2a038 100644
150150
case _ => Map[String, String]()
151151
}
152152
val childrenInfo = children.flatMap {
153-
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out
154-
index 69b4001ff34..6fda691652d 100644
155-
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out
156-
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/intersect-all.sql.out
157-
@@ -1,7 +1,7 @@
158-
-- Automatically generated by SQLQueryTestSuite
159-
-- !query
160-
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
161-
- (1, 2),
162-
+ (1, 2),
163-
(1, 2),
164-
(1, 3),
165-
(1, 3),
166-
@@ -11,7 +11,7 @@ CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
167-
AS tab1(k, v)
168-
-- !query analysis
169-
CreateViewCommand `tab1`, SELECT * FROM VALUES
170-
- (1, 2),
171-
+ (1, 2),
172-
(1, 2),
173-
(1, 3),
174-
(1, 3),
175-
@@ -26,8 +26,8 @@ CreateViewCommand `tab1`, SELECT * FROM VALUES
176-
177-
-- !query
178-
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
179-
- (1, 2),
180-
- (1, 2),
181-
+ (1, 2),
182-
+ (1, 2),
183-
(2, 3),
184-
(3, 4),
185-
(null, null),
186-
@@ -35,8 +35,8 @@ CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
187-
AS tab2(k, v)
188-
-- !query analysis
189-
CreateViewCommand `tab2`, SELECT * FROM VALUES
190-
- (1, 2),
191-
- (1, 2),
192-
+ (1, 2),
193-
+ (1, 2),
194-
(2, 3),
195-
(3, 4),
196-
(null, null),
197153
diff --git a/sql/core/src/test/resources/sql-tests/inputs/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/decimalArithmeticOperations.sql
198154
index 13bbd9d81b7..541cdfb1e04 100644
199155
--- a/sql/core/src/test/resources/sql-tests/inputs/decimalArithmeticOperations.sql
@@ -211,18 +167,6 @@ index 13bbd9d81b7..541cdfb1e04 100644
211167
CREATE TEMPORARY VIEW t AS SELECT 1.0 as a, 0.0 as b;
212168

213169
-- division, remainder and pmod by 0 return NULL
214-
diff --git a/sql/core/src/test/resources/sql-tests/inputs/except-all.sql b/sql/core/src/test/resources/sql-tests/inputs/except-all.sql
215-
index e28f0721a64..788b43c242a 100644
216-
--- a/sql/core/src/test/resources/sql-tests/inputs/except-all.sql
217-
+++ b/sql/core/src/test/resources/sql-tests/inputs/except-all.sql
218-
@@ -1,3 +1,7 @@
219-
+-- TODO(https://github.com/apache/datafusion-comet/issues/4122)
220-
+-- EXCEPT ALL with GROUP BY returns incorrect results on Spark 4.1
221-
+--SET spark.comet.enabled = false
222-
+
223-
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
224-
(0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1);
225-
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
226170
diff --git a/sql/core/src/test/resources/sql-tests/inputs/explain-aqe.sql b/sql/core/src/test/resources/sql-tests/inputs/explain-aqe.sql
227171
index 7aef901da4f..f3d6e18926d 100644
228172
--- a/sql/core/src/test/resources/sql-tests/inputs/explain-aqe.sql
@@ -280,32 +224,6 @@ index 35128da97fd..25b873ae859 100644
280224
-- Positive test cases
281225
-- Create a table with some testing data.
282226
DROP TABLE IF EXISTS t1;
283-
diff --git a/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql b/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql
284-
index 077caa5dd44..697457d4251 100644
285-
--- a/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql
286-
+++ b/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql
287-
@@ -1,5 +1,9 @@
288-
+-- TODO(https://github.com/apache/datafusion-comet/issues/4122)
289-
+-- INTERSECT ALL with GROUP BY returns incorrect results on Spark 4.1
290-
+--SET spark.comet.enabled = false
291-
+
292-
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
293-
- (1, 2),
294-
+ (1, 2),
295-
(1, 2),
296-
(1, 3),
297-
(1, 3),
298-
@@ -8,8 +12,8 @@ CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
299-
(null, null)
300-
AS tab1(k, v);
301-
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
302-
- (1, 2),
303-
- (1, 2),
304-
+ (1, 2),
305-
+ (1, 2),
306-
(2, 3),
307-
(3, 4),
308-
(null, null),
309227
diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql
310228
index 41fd4de2a09..162d5a817b6 100644
311229
--- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql
@@ -428,30 +346,6 @@ index 21a3ce1e122..f4762ab98f0 100644
428346
SET spark.sql.ansi.enabled = false;
429347

430348
-- In COMPENSATION views get invalidated if the type can't cast
431-
diff --git a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out
432-
index 44f95f225ab..361866fc298 100644
433-
--- a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out
434-
+++ b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out
435-
@@ -1,7 +1,7 @@
436-
-- Automatically generated by SQLQueryTestSuite
437-
-- !query
438-
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
439-
- (1, 2),
440-
+ (1, 2),
441-
(1, 2),
442-
(1, 3),
443-
(1, 3),
444-
@@ -17,8 +17,8 @@ struct<>
445-
446-
-- !query
447-
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
448-
- (1, 2),
449-
- (1, 2),
450-
+ (1, 2),
451-
+ (1, 2),
452-
(2, 3),
453-
(3, 4),
454-
(null, null),
455349
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
456350
index 0d807aeae4d..6d7744e771b 100644
457351
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala

docs/source/contributor-guide/benchmarking_aws_ec2.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ make release
104104
Set `COMET_JAR` environment variable.
105105

106106
```shell
107-
export COMET_JAR=/home/ec2-user/datafusion-comet/spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar
107+
export COMET_JAR=/home/ec2-user/datafusion-comet/spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar
108108
```
109109

110110
## Run Benchmarks

docs/source/contributor-guide/benchmarking_macos.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,13 @@ export DF_BENCH=`pwd`
5555

5656
## Install Spark
5757

58-
Install Apache Spark. This example refers to 3.5.4 version.
58+
Install Apache Spark. This example refers to 4.1.1 version.
5959

6060
```shell
61-
wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
62-
tar xzf spark-3.5.4-bin-hadoop3.tgz
63-
sudo mv spark-3.5.4-bin-hadoop3 /opt
64-
export SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3/
61+
wget https://archive.apache.org/dist/spark/spark-4.1.1/spark-4.1.1-bin-hadoop3.tgz
62+
tar xzf spark-4.1.1-bin-hadoop3.tgz
63+
sudo mv spark-4.1.1-bin-hadoop3 /opt
64+
export SPARK_HOME=/opt/spark-4.1.1-bin-hadoop3/
6565
```
6666

6767
Start Spark in standalone mode:
@@ -129,7 +129,7 @@ make release COMET_FEATURES=mimalloc
129129
Set `COMET_JAR` to point to the location of the Comet jar file. Example for Comet 0.8
130130

131131
```shell
132-
export COMET_JAR=`pwd`/spark/target/comet-spark-spark3.5_2.12-0.8.0-SNAPSHOT.jar
132+
export COMET_JAR=`pwd`/spark/target/comet-spark-spark4.1_2.13-0.8.0-SNAPSHOT.jar
133133
```
134134

135135
Run the following command (the `--data` parameter will need to be updated to point to your S3 bucket):

docs/source/contributor-guide/benchmarking_spark_sql_perf.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ partitioning and writing to Parquet format automatically.
3434

3535
## Prerequisites
3636

37-
- Java 17 (for Spark 3.5+)
38-
- Apache Spark 3.5.x
37+
- Java 17
38+
- Apache Spark 4.1.x
3939
- SBT (Scala Build Tool)
4040
- C compiler toolchain (`gcc`, `make`, `flex`, `bison`, `byacc`)
4141

@@ -225,7 +225,7 @@ Build Comet from source and launch `spark-shell` with both the Comet and spark-s
225225

226226
```shell
227227
make release
228-
export COMET_JAR=$(pwd)/spark/target/comet-spark-spark3.5_2.12-*.jar
228+
export COMET_JAR=$(pwd)/spark/target/comet-spark-spark4.1_2.13-*.jar
229229

230230
$SPARK_HOME/bin/spark-shell \
231231
--master $SPARK_MASTER \

docs/source/contributor-guide/debugging.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ make release COMET_FEATURES=backtrace
136136
Set `RUST_BACKTRACE=1` for the Spark worker/executor process, or for `spark-submit` if running in local mode.
137137

138138
```console
139-
RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true
139+
RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true
140140
```
141141

142142
Get the expanded exception details

0 commit comments

Comments
 (0)