Skip to content

Commit f8c3a64

Browse files
authored
Merge branch 'main' into cast_module_refactor_boolean
2 parents fb9edf0 + c73ac2e commit f8c3a64

66 files changed

Lines changed: 3985 additions & 1229 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/iceberg_spark_test.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,9 @@ jobs:
6969
~/.cargo/registry
7070
~/.cargo/git
7171
native/target
72-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
72+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
7373
restore-keys: |
74-
${{ runner.os }}-cargo-ci-
74+
${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-
7575
7676
- name: Build native library
7777
# Use CI profile for faster builds (no LTO) and to share cache with pr_build_linux.yml.
@@ -88,7 +88,7 @@ jobs:
8888
~/.cargo/registry
8989
~/.cargo/git
9090
native/target
91-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
91+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
9292

9393
- name: Upload native library
9494
uses: actions/upload-artifact@v6

.github/workflows/pr_build_linux.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ jobs:
8484
~/.cargo/registry
8585
~/.cargo/git
8686
native/target
87-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
87+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
8888
restore-keys: |
89-
${{ runner.os }}-cargo-ci-
89+
${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-
9090
9191
- name: Build native library (CI profile)
9292
run: |
@@ -112,7 +112,7 @@ jobs:
112112
~/.cargo/registry
113113
~/.cargo/git
114114
native/target
115-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
115+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
116116

117117
# Run Rust tests (runs in parallel with build-native, uses debug builds)
118118
linux-test-rust:
@@ -138,9 +138,9 @@ jobs:
138138
~/.cargo/git
139139
native/target
140140
# Note: Java version intentionally excluded - Rust target is JDK-independent
141-
key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
141+
key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
142142
restore-keys: |
143-
${{ runner.os }}-cargo-debug-
143+
${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-
144144
145145
- name: Rust test steps
146146
uses: ./.github/actions/rust-test
@@ -153,7 +153,7 @@ jobs:
153153
~/.cargo/registry
154154
~/.cargo/git
155155
native/target
156-
key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
156+
key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
157157

158158
linux-test:
159159
needs: build-native

.github/workflows/pr_build_macos.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ jobs:
8484
~/.cargo/registry
8585
~/.cargo/git
8686
native/target
87-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
87+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
8888
restore-keys: |
89-
${{ runner.os }}-cargo-ci-
89+
${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-
9090
9191
- name: Build native library (CI profile)
9292
run: |
@@ -112,7 +112,7 @@ jobs:
112112
~/.cargo/registry
113113
~/.cargo/git
114114
native/target
115-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
115+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
116116

117117
macos-aarch64-test:
118118
needs: build-native

.github/workflows/spark_sql_test.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@ jobs:
7575
~/.cargo/registry
7676
~/.cargo/git
7777
native/target
78-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
78+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
7979
restore-keys: |
80-
${{ runner.os }}-cargo-ci-
80+
${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-
8181
8282
- name: Build native library (CI profile)
8383
run: |
@@ -101,7 +101,7 @@ jobs:
101101
~/.cargo/registry
102102
~/.cargo/git
103103
native/target
104-
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}
104+
key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }}
105105

106106
spark-sql-test:
107107
needs: build-native

.github/workflows/stale.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
name: "Close stale PRs"
19+
on:
20+
schedule:
21+
- cron: "30 1 * * *"
22+
23+
jobs:
24+
close-stale-prs:
25+
runs-on: ubuntu-latest
26+
permissions:
27+
issues: write
28+
pull-requests: write
29+
steps:
30+
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
31+
with:
32+
stale-pr-message: "Thank you for your contribution. Unfortunately, this pull request is stale because it has been open 60 days with no activity. Please remove the stale label or comment or this will be closed in 7 days."
33+
days-before-pr-stale: 60
34+
days-before-pr-close: 7
35+
# do not close stale issues
36+
days-before-issue-stale: -1
37+
days-before-issue-close: -1
38+
repo-token: ${{ secrets.GITHUB_TOKEN }}

dev/benchmarks/README.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,79 @@ Generating charts:
7373
```shell
7474
python3 generate-comparison.py --benchmark tpch --labels "Spark 3.5.3" "Comet 0.9.0" "Gluten 1.4.0" --title "TPC-H @ 100 GB (single executor, 8 cores, local Parquet files)" spark-tpch-1752338506381.json comet-tpch-1752337818039.json gluten-tpch-1752337474344.json
7575
```
76+
77+
## Iceberg Benchmarking
78+
79+
Comet includes native Iceberg support via iceberg-rust integration. This enables benchmarking TPC-H queries
80+
against Iceberg tables with native scan acceleration.
81+
82+
### Prerequisites
83+
84+
Download the Iceberg Spark runtime JAR (required for running the benchmark):
85+
86+
```shell
87+
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
88+
export ICEBERG_JAR=$PWD/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
89+
```
90+
91+
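Note: The table-creation step below passes `--packages` to `spark-submit`, which downloads the Iceberg runtime automatically, so `ICEBERG_JAR` is only needed when running the benchmark itself.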
92+
93+
### Create Iceberg TPC-H tables
94+
95+
Convert existing Parquet TPC-H data to Iceberg format:
96+
97+
```shell
98+
export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse
99+
export ICEBERG_CATALOG=${ICEBERG_CATALOG:-local}
100+
101+
$SPARK_HOME/bin/spark-submit \
102+
--master $SPARK_MASTER \
103+
--packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
104+
--conf spark.driver.memory=8G \
105+
--conf spark.executor.instances=1 \
106+
--conf spark.executor.cores=8 \
107+
--conf spark.cores.max=8 \
108+
--conf spark.executor.memory=16g \
109+
--conf spark.sql.catalog.${ICEBERG_CATALOG}=org.apache.iceberg.spark.SparkCatalog \
110+
--conf spark.sql.catalog.${ICEBERG_CATALOG}.type=hadoop \
111+
--conf spark.sql.catalog.${ICEBERG_CATALOG}.warehouse=$ICEBERG_WAREHOUSE \
112+
create-iceberg-tpch.py \
113+
--parquet-path $TPCH_DATA \
114+
--catalog $ICEBERG_CATALOG \
115+
--database tpch
116+
```
117+
118+
### Run Iceberg benchmark
119+
120+
```shell
121+
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
122+
export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar
123+
export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
124+
export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse
125+
export TPCH_QUERIES=/mnt/bigdata/tpch/queries/
126+
sudo ./drop-caches.sh
127+
./comet-tpch-iceberg.sh
128+
```
129+
130+
The benchmark uses `spark.comet.scan.icebergNative.enabled=true` to enable Comet's native iceberg-rust
131+
integration. Verify native scanning is active by checking for `CometIcebergNativeScanExec` in the
132+
physical plan output.
133+
134+
### Iceberg-specific options
135+
136+
| Environment Variable | Default | Description |
137+
| -------------------- | ---------- | ----------------------------------- |
138+
| `ICEBERG_CATALOG` | `local` | Iceberg catalog name |
139+
| `ICEBERG_DATABASE` | `tpch` | Database containing TPC-H tables |
140+
| `ICEBERG_WAREHOUSE` | (required) | Path to Iceberg warehouse directory |
141+
142+
### Comparing Parquet vs Iceberg performance
143+
144+
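Run both the Parquet benchmark (`comet-tpch.sh`) and the Iceberg benchmark (`comet-tpch-iceberg.sh`), then chart the two result files side by side: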
145+
146+
```shell
147+
python3 generate-comparison.py --benchmark tpch \
148+
--labels "Comet (Parquet)" "Comet (Iceberg)" \
149+
--title "TPC-H @ 100 GB: Parquet vs Iceberg" \
150+
comet-tpch-*.json comet-iceberg-tpch-*.json
151+
```

dev/benchmarks/comet-tpcds.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ $SPARK_HOME/bin/spark-submit \
4040
--conf spark.executor.extraClassPath=$COMET_JAR \
4141
--conf spark.plugins=org.apache.spark.CometPlugin \
4242
--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
43+
--conf spark.comet.scan.impl=native_datafusion \
4344
--conf spark.comet.expression.Cast.allowIncompatible=true \
4445
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
4546
--conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
dev/benchmarks/comet-tpch-iceberg.sh

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#!/bin/bash
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
21+
# TPC-H benchmark using Iceberg tables with Comet's native iceberg-rust integration.
22+
#
23+
# Required environment variables:
24+
# SPARK_HOME - Path to Spark installation
25+
# SPARK_MASTER - Spark master URL (e.g., spark://localhost:7077)
26+
# COMET_JAR - Path to Comet JAR
27+
# ICEBERG_JAR - Path to Iceberg Spark runtime JAR
28+
# ICEBERG_WAREHOUSE - Path to Iceberg warehouse directory
29+
# TPCH_QUERIES - Path to TPC-H query files
30+
#
31+
# Optional:
32+
# ICEBERG_CATALOG - Catalog name (default: local)
33+
# ICEBERG_DATABASE - Database name (default: tpch)
34+
#
35+
# Setup (run once to create Iceberg tables from Parquet):
36+
# $SPARK_HOME/bin/spark-submit \
37+
# --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
38+
# --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
39+
# --conf spark.sql.catalog.local.type=hadoop \
40+
# --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \
41+
# create-iceberg-tpch.py \
42+
# --parquet-path $TPCH_DATA \
43+
# --catalog local \
44+
# --database tpch
45+
46+
set -e
47+
48+
# Defaults
49+
ICEBERG_CATALOG=${ICEBERG_CATALOG:-local}
50+
ICEBERG_DATABASE=${ICEBERG_DATABASE:-tpch}
51+
52+
# Validate required variables
53+
if [ -z "$SPARK_HOME" ]; then
54+
echo "Error: SPARK_HOME is not set"
55+
exit 1
56+
fi
57+
if [ -z "$COMET_JAR" ]; then
58+
echo "Error: COMET_JAR is not set"
59+
exit 1
60+
fi
61+
if [ -z "$ICEBERG_JAR" ]; then
62+
echo "Error: ICEBERG_JAR is not set"
63+
echo "Download from: https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/"
64+
exit 1
65+
fi
66+
if [ -z "$ICEBERG_WAREHOUSE" ]; then
67+
echo "Error: ICEBERG_WAREHOUSE is not set"
68+
exit 1
69+
fi
70+
if [ -z "$TPCH_QUERIES" ]; then
71+
echo "Error: TPCH_QUERIES is not set"
72+
exit 1
73+
fi
74+
75+
$SPARK_HOME/sbin/stop-master.sh 2>/dev/null || true
76+
$SPARK_HOME/sbin/stop-worker.sh 2>/dev/null || true
77+
78+
$SPARK_HOME/sbin/start-master.sh
79+
$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
80+
81+
$SPARK_HOME/bin/spark-submit \
82+
--master $SPARK_MASTER \
83+
--jars $COMET_JAR,$ICEBERG_JAR \
84+
--driver-class-path $COMET_JAR:$ICEBERG_JAR \
85+
--conf spark.driver.memory=8G \
86+
--conf spark.executor.instances=1 \
87+
--conf spark.executor.cores=8 \
88+
--conf spark.cores.max=8 \
89+
--conf spark.executor.memory=16g \
90+
--conf spark.memory.offHeap.enabled=true \
91+
--conf spark.memory.offHeap.size=16g \
92+
--conf spark.eventLog.enabled=true \
93+
--conf spark.driver.extraClassPath=$COMET_JAR:$ICEBERG_JAR \
94+
--conf spark.executor.extraClassPath=$COMET_JAR:$ICEBERG_JAR \
95+
--conf spark.plugins=org.apache.spark.CometPlugin \
96+
--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
97+
--conf spark.comet.exec.replaceSortMergeJoin=true \
98+
--conf spark.comet.expression.Cast.allowIncompatible=true \
99+
--conf spark.comet.enabled=true \
100+
--conf spark.comet.exec.enabled=true \
101+
--conf spark.comet.scan.icebergNative.enabled=true \
102+
--conf spark.comet.explainFallback.enabled=true \
103+
--conf spark.sql.catalog.${ICEBERG_CATALOG}=org.apache.iceberg.spark.SparkCatalog \
104+
--conf spark.sql.catalog.${ICEBERG_CATALOG}.type=hadoop \
105+
--conf spark.sql.catalog.${ICEBERG_CATALOG}.warehouse=$ICEBERG_WAREHOUSE \
106+
--conf spark.sql.defaultCatalog=${ICEBERG_CATALOG} \
107+
tpcbench.py \
108+
--name comet-iceberg \
109+
--benchmark tpch \
110+
--catalog $ICEBERG_CATALOG \
111+
--database $ICEBERG_DATABASE \
112+
--queries $TPCH_QUERIES \
113+
--output . \
114+
--iterations 1

dev/benchmarks/comet-tpch.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ $SPARK_HOME/bin/spark-submit \
4040
--conf spark.executor.extraClassPath=$COMET_JAR \
4141
--conf spark.plugins=org.apache.spark.CometPlugin \
4242
--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
43+
--conf spark.comet.scan.impl=native_datafusion \
4344
--conf spark.comet.exec.replaceSortMergeJoin=true \
4445
--conf spark.comet.expression.Cast.allowIncompatible=true \
4546
--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \

0 commit comments

Comments
 (0)