Skip to content

Commit 160a817

Browse files
Merge branch 'apache:main' into main
2 parents 5ca3888 + 48ebd28 commit 160a817

35 files changed

Lines changed: 1417 additions & 1261 deletions

.github/actions/java-test/action.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ inputs:
3232
scan_impl:
3333
description: 'The default Parquet scan implementation'
3434
required: false
35-
default: 'native_comet'
35+
default: 'auto'
3636
upload-test-reports:
3737
description: 'Whether to upload test results including coverage to GitHub'
3838
required: false

.github/workflows/pr_build_linux.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -164,7 +164,7 @@ jobs:
164164
- name: "Spark 3.4, JDK 11, Scala 2.12"
165165
java_version: "11"
166166
maven_opts: "-Pspark-3.4 -Pscala-2.12"
167-
scan_impl: "native_comet"
167+
scan_impl: "auto"
168168

169169
- name: "Spark 3.5.5, JDK 17, Scala 2.13"
170170
java_version: "17"
@@ -174,7 +174,7 @@ jobs:
174174
- name: "Spark 3.5.6, JDK 17, Scala 2.13"
175175
java_version: "17"
176176
maven_opts: "-Pspark-3.5 -Dspark.version=3.5.6 -Pscala-2.13"
177-
scan_impl: "native_comet"
177+
scan_impl: "auto"
178178

179179
- name: "Spark 3.5, JDK 17, Scala 2.12"
180180
java_version: "17"

.github/workflows/spark_sql_test.yml

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -116,15 +116,12 @@ jobs:
116116
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
117117
# Test combinations:
118118
# - auto scan: all Spark versions (3.4, 3.5, 4.0)
119-
# - native_comet: Spark 3.4, 3.5
120119
# - native_iceberg_compat: Spark 3.5 only
121120
config:
122121
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto', scan-env: ''}
123122
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'auto', scan-env: ''}
124-
- {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}
125-
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'native_comet', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_comet'}
126-
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'native_comet', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_comet'}
127123
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'native_iceberg_compat', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_iceberg_compat'}
124+
- {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}
128125
# Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
129126
exclude:
130127
- config: {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}

Makefile

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -51,26 +51,26 @@ format:
5151

5252
# build native libs for amd64 architecture Linux/MacOS on a Linux/amd64 machine/container
5353
core-amd64-libs:
54-
cd native && cargo build -j 2 --release $(FEATURES_ARG)
54+
cd native && RUSTFLAGS="-Ctarget-cpu=x86-64-v3" cargo build -j 2 --release $(FEATURES_ARG)
5555
ifdef HAS_OSXCROSS
5656
rustup target add x86_64-apple-darwin
5757
cd native && cargo build -j 2 --target x86_64-apple-darwin --release $(FEATURES_ARG)
5858
endif
5959

6060
# build native libs for arm64 architecture Linux/MacOS on a Linux/arm64 machine/container
6161
core-arm64-libs:
62-
cd native && cargo build -j 2 --release $(FEATURES_ARG)
62+
cd native && RUSTFLAGS="-Ctarget-cpu=neoverse-n1" cargo build -j 2 --release $(FEATURES_ARG)
6363
ifdef HAS_OSXCROSS
6464
rustup target add aarch64-apple-darwin
6565
cd native && cargo build -j 2 --target aarch64-apple-darwin --release $(FEATURES_ARG)
6666
endif
6767

6868
core-amd64:
6969
rustup target add x86_64-apple-darwin
70-
cd native && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
70+
cd native && RUSTFLAGS="-Ctarget-cpu=skylake" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
7171
mkdir -p common/target/classes/org/apache/comet/darwin/x86_64
7272
cp native/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
73-
cd native && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release $(FEATURES_ARG)
73+
cd native && RUSTFLAGS="-Ctarget-cpu=x86-64-v3" cargo build --release $(FEATURES_ARG)
7474
mkdir -p common/target/classes/org/apache/comet/linux/amd64
7575
cp native/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
7676
jar -cf common/target/comet-native-x86_64.jar \
@@ -83,7 +83,7 @@ core-arm64:
8383
cd native && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release $(FEATURES_ARG)
8484
mkdir -p common/target/classes/org/apache/comet/darwin/aarch64
8585
cp native/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
86-
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)
86+
cd native && RUSTFLAGS="-Ctarget-cpu=neoverse-n1" cargo build --release $(FEATURES_ARG)
8787
mkdir -p common/target/classes/org/apache/comet/linux/aarch64
8888
cp native/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
8989
jar -cf common/target/comet-native-aarch64.jar \
@@ -94,8 +94,8 @@ core-arm64:
9494
release-linux: clean
9595
rustup target add aarch64-apple-darwin x86_64-apple-darwin
9696
cd native && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release $(FEATURES_ARG)
97-
cd native && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
98-
cd native && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release $(FEATURES_ARG)
97+
cd native && RUSTFLAGS="-Ctarget-cpu=skylake" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
98+
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)
9999
./mvnw install -Prelease -DskipTests $(PROFILES)
100100
release:
101101
cd native && RUSTFLAGS="$(RUSTFLAGS) -Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -124,11 +124,8 @@ object CometConf extends ShimCometConf {
124124
val COMET_NATIVE_SCAN_IMPL: ConfigEntry[String] = conf("spark.comet.scan.impl")
125125
.category(CATEGORY_SCAN)
126126
.doc(
127-
s"The implementation of Comet Native Scan to use. Available modes are `$SCAN_NATIVE_COMET`," +
127+
"The implementation of Comet Native Scan to use. Available modes are " +
128128
s"`$SCAN_NATIVE_DATAFUSION`, and `$SCAN_NATIVE_ICEBERG_COMPAT`. " +
129-
s"`$SCAN_NATIVE_COMET` (DEPRECATED - will be removed in a future release) is for the " +
130-
"original Comet native scan which uses a jvm based parquet file reader and native " +
131-
"column decoding. Supports simple types only. " +
132129
s"`$SCAN_NATIVE_DATAFUSION` is a fully native implementation of scan based on " +
133130
"DataFusion. " +
134131
s"`$SCAN_NATIVE_ICEBERG_COMPAT` is the recommended native implementation that " +
@@ -137,8 +134,7 @@ object CometConf extends ShimCometConf {
137134
.internal()
138135
.stringConf
139136
.transform(_.toLowerCase(Locale.ROOT))
140-
.checkValues(
141-
Set(SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
137+
.checkValues(Set(SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
142138
.createWithEnvVarOrDefault("COMET_PARQUET_SCAN_IMPL", SCAN_AUTO)
143139

144140
val COMET_ICEBERG_NATIVE_ENABLED: ConfigEntry[Boolean] =

dev/diffs/3.4.3.diff

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1157,7 +1157,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11571157
import org.apache.spark.SparkConf
11581158
import org.apache.spark.sql.{AnalysisException, QueryTest}
11591159
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
1160-
+import org.apache.spark.sql.comet.CometScanExec
1160+
+import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
11611161
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
11621162
import org.apache.spark.sql.connector.read.ScanBuilder
11631163
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
@@ -1167,7 +1167,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11671167
assert(
11681168
- df.queryExecution.executedPlan.exists(_.isInstanceOf[FileSourceScanExec]))
11691169
+ df.queryExecution.executedPlan.exists {
1170-
+ case _: FileSourceScanExec | _: CometScanExec => true
1170+
+ case _: FileSourceScanExec | _: CometScanExec | _: CometNativeScanExec => true
11711171
+ case _ => false
11721172
+ }
11731173
+ )

dev/diffs/3.5.8.diff

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1111,7 +1111,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11111111
import org.apache.spark.SparkConf
11121112
import org.apache.spark.sql.{AnalysisException, QueryTest}
11131113
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
1114-
+import org.apache.spark.sql.comet.CometScanExec
1114+
+import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
11151115
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
11161116
import org.apache.spark.sql.connector.read.ScanBuilder
11171117
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
@@ -1121,7 +1121,7 @@ index cfc8b2cc845..c6fcfd7bd08 100644
11211121
assert(
11221122
- df.queryExecution.executedPlan.exists(_.isInstanceOf[FileSourceScanExec]))
11231123
+ df.queryExecution.executedPlan.exists {
1124-
+ case _: FileSourceScanExec | _: CometScanExec => true
1124+
+ case _: FileSourceScanExec | _: CometScanExec | _: CometNativeScanExec => true
11251125
+ case _ => false
11261126
+ }
11271127
+ )

dev/diffs/4.0.1.diff

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1443,7 +1443,7 @@ index 2a0ab21ddb0..e8a5a891105 100644
14431443
import org.apache.spark.{SparkConf, SparkException}
14441444
import org.apache.spark.sql.QueryTest
14451445
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
1446-
+import org.apache.spark.sql.comet.CometScanExec
1446+
+import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
14471447
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
14481448
import org.apache.spark.sql.connector.read.ScanBuilder
14491449
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
@@ -1453,7 +1453,7 @@ index 2a0ab21ddb0..e8a5a891105 100644
14531453
assert(
14541454
- df.queryExecution.executedPlan.exists(_.isInstanceOf[FileSourceScanExec]))
14551455
+ df.queryExecution.executedPlan.exists {
1456-
+ case _: FileSourceScanExec | _: CometScanExec => true
1456+
+ case _: FileSourceScanExec | _: CometScanExec | _: CometNativeScanExec => true
14571457
+ case _ => false
14581458
+ }
14591459
+ )

dev/regenerate-golden-files.sh

Lines changed: 3 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -74,16 +74,6 @@ build_native() {
7474
cd native && cargo build && cd ..
7575
}
7676

77-
# Install Comet for a specific Spark version
78-
install_for_spark_version() {
79-
local spark_version=$1
80-
echo ""
81-
echo "=============================================="
82-
echo "[INFO] Installing Comet for Spark $spark_version"
83-
echo "=============================================="
84-
./mvnw install -DskipTests -Pspark-$spark_version
85-
}
86-
8777
# Regenerate golden files for a specific Spark version
8878
regenerate_golden_files() {
8979
local spark_version=$1
@@ -94,12 +84,12 @@ regenerate_golden_files() {
9484
echo "=============================================="
9585

9686
echo "[INFO] Running CometTPCDSV1_4_PlanStabilitySuite..."
97-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark \
87+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw \
9888
-Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" \
9989
-Pspark-$spark_version -nsu test
10090

10191
echo "[INFO] Running CometTPCDSV2_7_PlanStabilitySuite..."
102-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark \
92+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw \
10393
-Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" \
10494
-Pspark-$spark_version -nsu test
10595
}
@@ -158,9 +148,8 @@ main() {
158148
versions=("3.4" "3.5" "4.0")
159149
fi
160150

161-
# Install and regenerate for each version
151+
# Regenerate for each version
162152
for version in "${versions[@]}"; do
163-
install_for_spark_version "$version"
164153
regenerate_golden_files "$version"
165154
done
166155

docs/source/contributor-guide/development.md

Lines changed: 12 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -191,52 +191,43 @@ Spark version, and runs the plan stability tests with `SPARK_GENERATE_GOLDEN_FIL
191191

192192
Alternatively, you can run the tests manually using the following commands.
193193

194-
First, Comet needs to be installed for each Spark version to be tested:
195-
196-
```sh
197-
./mvnw install -DskipTests -Pspark-3.4
198-
./mvnw install -DskipTests -Pspark-3.5
199-
# note that Spark 4.0 requires JDK 17 or later
200-
./mvnw install -DskipTests -Pspark-4.0
201-
```
202-
203194
Note that the output files get written to `$SPARK_HOME`.
204195

205196
The tests can be run with:
206197

207198
```sh
208199
export SPARK_HOME=`pwd`
209-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
210-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
211-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
200+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
201+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
202+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
212203
```
213204

214205
and
215206

216207
```sh
217208
export SPARK_HOME=`pwd`
218-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
219-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
220-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
209+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
210+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
211+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
221212
```
222213

223214
If your pull request changes the query plans generated by Comet, you should regenerate the golden files.
224215
To regenerate the golden files, you can run the following commands.
225216

226217
```sh
227218
export SPARK_HOME=`pwd`
228-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
229-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
230-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
219+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
220+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
221+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
231222
```
232223

233224
and
234225

235226
```sh
236227
export SPARK_HOME=`pwd`
237-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
238-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
239-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
228+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
229+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
230+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
240231
```
241232

242233
## Benchmark

0 commit comments

Comments
 (0)