File tree Expand file tree Collapse file tree
spark/src/main/scala/org/apache/spark/sql/comet Expand file tree Collapse file tree Original file line number Diff line number Diff line change 6363 key : ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
6464 restore-keys : |
6565 ${{ runner.os }}-java-maven-
66-
66+ - name : Build Comet
67+ run : make release
6768 - name : Cache TPC-DS generated data
6869 id : cache-tpcds-sf-1
6970 uses : actions/cache@v4
7677 with :
7778 repository : databricks/tpcds-kit
7879 path : ./tpcds-kit
79- - name : Build Comet
80- run : make release
81- - name : Upload Comet native lib
82- uses : actions/upload-artifact@v4
83- with :
84- name : libcomet-${{ github.run_id }}
85- path : |
86- native/target/release/libcomet.so
87- native/target/release/libcomet.dylib
88- retention-days : 1 # remove the artifact after 1 day, only valid for this workflow
89- overwrite : true
9080 - name : Build tpcds-kit
9181 if : steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
9282 run : |
@@ -132,11 +122,8 @@ jobs:
132122 path : ./tpcds-sf-1
133123 key : tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
134124 fail-on-cache-miss : true # should always be cached, since the pre-step generates it if it does not exist
135- - name : Download Comet native lib
136- uses : actions/download-artifact@v5
137- with :
138- name : libcomet-${{ github.run_id }}
139- path : native/target/release
125+ - name : Build Comet
126+ run : make release
140127 - name : Run TPC-DS queries (Sort merge join)
141128 if : matrix.join == 'sort_merge'
142129 run : |
Original file line number Diff line number Diff line change 7171 key : tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}
7272 - name : Build Comet
7373 run : make release
74- - name : Upload Comet native lib
75- uses : actions/upload-artifact@v4
76- with :
77- name : libcomet-${{ github.run_id }}
78- path : |
79- native/target/release/libcomet.so
80- native/target/release/libcomet.dylib
81- retention-days : 1 # remove the artifact after 1 day, only valid for this workflow
82- overwrite : true
8374 - name : Generate TPC-H (SF=1) table data
8475 if : steps.cache-tpch-sf-1.outputs.cache-hit != 'true'
8576 run : |
@@ -115,11 +106,8 @@ jobs:
115106 path : ./tpch
116107 key : tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }}
117108 fail-on-cache-miss : true # should always be cached, since the pre-step generates it if it does not exist
118- - name : Download Comet native lib
119- uses : actions/download-artifact@v5
120- with :
121- name : libcomet-${{ github.run_id }}
122- path : native/target/release
109+ - name : Build Comet
110+ run : make release
123111 - name : Run TPC-H queries
124112 run : |
125113 SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test
Original file line number Diff line number Diff line change @@ -22,6 +22,10 @@ under the License.
2222This directory contains scripts used for generating benchmark results that are published in this repository and in
2323the Comet documentation.
2424
25+ For full instructions on running these benchmarks on an EC2 instance, see the [Comet Benchmarking on EC2 Guide].
26+
27+ [Comet Benchmarking on EC2 Guide]: https://datafusion.apache.org/comet/contributor-guide/benchmarking_aws_ec2.html
28+
2529## Example usage
2630
2731Set Spark environment variables:
@@ -50,7 +54,7 @@ Run Comet benchmark:
5054
5155``` shell
5256export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
53- export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.9.0.jar
57+ export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar
5458sudo ./drop-caches.sh
5559./comet-tpch.sh
5660```
Original file line number Diff line number Diff line change @@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242 --conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \
4343 --conf spark.blaze.enable=true \
4444 --conf spark.blaze.forceShuffledHashJoin=true \
45+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547 tpcbench.py \
4648 --name blaze \
4749 --benchmark tpcds \
Original file line number Diff line number Diff line change @@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242 --conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \
4343 --conf spark.blaze.enable=true \
4444 --conf spark.blaze.forceShuffledHashJoin=true \
45+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547 tpcbench.py \
4648 --name blaze \
4749 --benchmark tpch \
Original file line number Diff line number Diff line change @@ -41,7 +41,8 @@ $SPARK_HOME/bin/spark-submit \
4141 --conf spark.plugins=org.apache.spark.CometPlugin \
4242 --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
4343 --conf spark.comet.expression.allowIncompatible=true \
44- --conf spark.comet.scan.impl=native_datafusion \
44+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
45+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4546 tpcbench.py \
4647 --name comet \
4748 --benchmark tpcds \
Original file line number Diff line number Diff line change @@ -42,7 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242 --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
4343 --conf spark.comet.exec.replaceSortMergeJoin=true \
4444 --conf spark.comet.expression.allowIncompatible=true \
45- --conf spark.comet.scan.impl=native_datafusion \
45+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4647 tpcbench.py \
4748 --name comet \
4849 --benchmark tpch \
Original file line number Diff line number Diff line change @@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242 --conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \
4343 --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
4444 --conf spark.sql.session.timeZone=UTC \
45+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547 tpcbench.py \
4648 --name gluten \
4749 --benchmark tpcds \
Original file line number Diff line number Diff line change @@ -42,6 +42,8 @@ $SPARK_HOME/bin/spark-submit \
4242 --conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \
4343 --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
4444 --conf spark.sql.session.timeZone=UTC \
45+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
46+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
4547 tpcbench.py \
4648 --name gluten \
4749 --benchmark tpch \
Original file line number Diff line number Diff line change @@ -34,6 +34,8 @@ $SPARK_HOME/bin/spark-submit \
3434 --conf spark.memory.offHeap.enabled=true \
3535 --conf spark.memory.offHeap.size=16g \
3636 --conf spark.eventLog.enabled=true \
37+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
38+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
3739 tpcbench.py \
3840 --name spark \
3941 --benchmark tpcds \
You can’t perform that action at this time.
0 commit comments