Skip to content

Commit 4f46d03

Browse files
committed
Merge remote-tracking branch 'apache/main' into upgrade-plugins
2 parents 7613c94 + 8295024 commit 4f46d03

17 files changed

Lines changed: 120 additions & 179 deletions

File tree

Lines changed: 4 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,8 @@ jobs:
6363
key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
6464
restore-keys: |
6565
${{ runner.os }}-java-maven-
66-
66+
- name: Build Comet
67+
run: make release
6768
- name: Cache TPC-DS generated data
6869
id: cache-tpcds-sf-1
6970
uses: actions/cache@v4
@@ -76,17 +77,6 @@ jobs:
7677
with:
7778
repository: databricks/tpcds-kit
7879
path: ./tpcds-kit
79-
- name: Build Comet
80-
run: make release
81-
- name: Upload Comet native lib
82-
uses: actions/upload-artifact@v4
83-
with:
84-
name: libcomet-${{ github.run_id }}
85-
path: |
86-
native/target/release/libcomet.so
87-
native/target/release/libcomet.dylib
88-
retention-days: 1 # remove the artifact after 1 day, only valid for this workflow
89-
overwrite: true
9080
- name: Build tpcds-kit
9181
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
9282
run: |
@@ -132,11 +122,8 @@ jobs:
132122
path: ./tpcds-sf-1
133123
key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
134124
fail-on-cache-miss: true # the data should always be cached, because the pre-step generates it if it does not exist
135-
- name: Download Comet native lib
136-
uses: actions/download-artifact@v5
137-
with:
138-
name: libcomet-${{ github.run_id }}
139-
path: native/target/release
125+
- name: Build Comet
126+
run: make release
140127
- name: Run TPC-DS queries (Sort merge join)
141128
if: matrix.join == 'sort_merge'
142129
run: |

.github/workflows/benchmark-tpch.yml

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -71,15 +71,6 @@ jobs:
7171
key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}
7272
- name: Build Comet
7373
run: make release
74-
- name: Upload Comet native lib
75-
uses: actions/upload-artifact@v4
76-
with:
77-
name: libcomet-${{ github.run_id }}
78-
path: |
79-
native/target/release/libcomet.so
80-
native/target/release/libcomet.dylib
81-
retention-days: 1 # remove the artifact after 1 day, only valid for this workflow
82-
overwrite: true
8374
- name: Generate TPC-H (SF=1) table data
8475
if: steps.cache-tpch-sf-1.outputs.cache-hit != 'true'
8576
run: |
@@ -115,11 +106,8 @@ jobs:
115106
path: ./tpch
116107
key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}
117108
fail-on-cache-miss: true # the data should always be cached, because the pre-step generates it if it does not exist
118-
- name: Download Comet native lib
119-
uses: actions/download-artifact@v5
120-
with:
121-
name: libcomet-${{ github.run_id }}
122-
path: native/target/release
109+
- name: Build Comet
110+
run: make release
123111
- name: Run TPC-H queries
124112
run: |
125113
SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test

dev/benchmarks/README.md

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,10 @@ under the License.
2222
This directory contains scripts used for generating benchmark results that are published in this repository and in
2323
the Comet documentation.
2424

25+
For full instructions on running these benchmarks on an EC2 instance, see the [Comet Benchmarking on EC2 Guide].
26+
27+
[Comet Benchmarking on EC2 Guide]: https://datafusion.apache.org/comet/contributor-guide/benchmarking_aws_ec2.html
28+
2529
## Example usage
2630

2731
Set Spark environment variables:
@@ -50,7 +54,7 @@ Run Comet benchmark:
5054

5155
```shell
5256
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
53-
export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.9.0.jar
57+
export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar
5458
sudo ./drop-caches.sh
5559
./comet-tpch.sh
5660
```

dev/benchmarks/blaze-tpcds.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242
--conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \
4343
--conf spark.blaze.enable=true \
4444
--conf spark.blaze.forceShuffledHashJoin=true \
45+
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547
tpcbench.py \
4648
--name blaze \
4749
--benchmark tpcds \

dev/benchmarks/blaze-tpch.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242
--conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \
4343
--conf spark.blaze.enable=true \
4444
--conf spark.blaze.forceShuffledHashJoin=true \
45+
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547
tpcbench.py \
4648
--name blaze \
4749
--benchmark tpch \

dev/benchmarks/comet-tpcds.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,8 @@ $SPARK_HOME/bin/spark-submit \
4141
--conf spark.plugins=org.apache.spark.CometPlugin \
4242
--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
4343
--conf spark.comet.expression.allowIncompatible=true \
44-
--conf spark.comet.scan.impl=native_datafusion \
44+
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
45+
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4546
tpcbench.py \
4647
--name comet \
4748
--benchmark tpcds \

dev/benchmarks/comet-tpch.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242
--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
4343
--conf spark.comet.exec.replaceSortMergeJoin=true \
4444
--conf spark.comet.expression.allowIncompatible=true \
45-
--conf spark.comet.scan.impl=native_datafusion \
45+
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4647
tpcbench.py \
4748
--name comet \
4849
--benchmark tpch \

dev/benchmarks/gluten-tpcds.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242
--conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \
4343
--conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
4444
--conf spark.sql.session.timeZone=UTC \
45+
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547
tpcbench.py \
4648
--name gluten \
4749
--benchmark tpcds \

dev/benchmarks/gluten-tpch.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242
--conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \
4343
--conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
4444
--conf spark.sql.session.timeZone=UTC \
45+
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547
tpcbench.py \
4648
--name gluten \
4749
--benchmark tpch \

dev/benchmarks/spark-tpcds.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,8 @@ $SPARK_HOME/bin/spark-submit \
3434
--conf spark.memory.offHeap.enabled=true \
3535
--conf spark.memory.offHeap.size=16g \
3636
--conf spark.eventLog.enabled=true \
37+
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
38+
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
3739
tpcbench.py \
3840
--name spark \
3941
--benchmark tpcds \

0 commit comments

Comments
 (0)