diff --git a/.github/actions/rust-test/action.yaml b/.github/actions/rust-test/action.yaml index 10fc1375f0..c39c2dcd4f 100644 --- a/.github/actions/rust-test/action.yaml +++ b/.github/actions/rust-test/action.yaml @@ -21,11 +21,7 @@ description: "Run Rust tests" runs: using: "composite" steps: - - name: Check Cargo fmt - shell: bash - run: | - cd native - cargo fmt --all -- --check --color=never + # Note: cargo fmt check is now handled by the lint job that gates this workflow - name: Check Cargo clippy shell: bash diff --git a/.github/workflows/benchmark-tpcds.yml b/.github/workflows/benchmark-tpcds.yml deleted file mode 100644 index db1fce0192..0000000000 --- a/.github/workflows/benchmark-tpcds.yml +++ /dev/null @@ -1,155 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: TPC-DS Correctness - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - paths-ignore: - - "doc/**" - - "docs/**" - - "**.md" - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - pull_request: - paths-ignore: - - "doc/**" - - "docs/**" - - "**.md" - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - # manual trigger - # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow - workflow_dispatch: - -env: - RUST_VERSION: stable - -jobs: - prepare: - name: Build native and prepare data - runs-on: ubuntu-latest - container: - image: amd64/rust - env: - JAVA_VERSION: 11 - steps: - - uses: actions/checkout@v6 - - name: Setup Rust & Java toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: ${{env.RUST_VERSION}} - jdk-version: 11 - - name: Cache Maven dependencies - uses: actions/cache@v5 - with: - path: | - ~/.m2/repository - /root/.m2/repository - key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ runner.os }}-java-maven- - - name: Build Comet - run: make release - - name: Cache TPC-DS generated data - id: cache-tpcds-sf-1 - uses: actions/cache@v5 - with: - path: ./tpcds-sf-1 - key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }} - - name: Checkout tpcds-kit repository - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - uses: actions/checkout@v6 - with: - repository: databricks/tpcds-kit - path: ./tpcds-kit - - name: Build tpcds-kit - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: | - apt-get install -y yacc bison flex gcc-12 g++-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 120 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - gcc --version - cd tpcds-kit/tools && make OS=LINUX - - name: Generate TPC-DS (SF=1) table data - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: | - cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1" - cd .. - - benchmark: - name: Run TPCDSQuerySuite - runs-on: ubuntu-latest - needs: [prepare] - container: - image: amd64/rust - strategy: - matrix: - join: [sort_merge, broadcast, hash] - steps: - - uses: actions/checkout@v6 - - name: Setup Rust & Java toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: ${{env.RUST_VERSION}} - jdk-version: 11 - - name: Cache Maven dependencies - uses: actions/cache@v5 - with: - path: | - ~/.m2/repository - /root/.m2/repository - key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ runner.os }}-java-maven- - - name: Restore TPC-DS generated data - id: cache-tpcds-sf-1 - uses: actions/cache/restore@v5 - with: - path: ./tpcds-sf-1 - key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }} - fail-on-cache-miss: true # it's always be cached as it should be generated by pre-step if not existed - - name: Build Comet - run: make release - - name: Run TPC-DS queries (Sort merge join) - if: matrix.join == 'sort_merge' - run: | - SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test - env: - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.preferSortMergeJoin=true - - name: Run TPC-DS queries (Broadcast hash join) - if: matrix.join == 'broadcast' - run: | - SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test - env: - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=10485760 - - name: Run TPC-DS queries (Shuffled hash join) - if: matrix.join == 'hash' - run: | - SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test - env: - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.forceApplyShuffledHashJoin=true diff --git a/.github/workflows/benchmark-tpch.yml b/.github/workflows/benchmark-tpch.yml deleted file mode 100644 index 124b0d0c78..0000000000 --- a/.github/workflows/benchmark-tpch.yml +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: TPC-H Correctness - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - paths-ignore: - - "doc/**" - - "docs/**" - - "**.md" - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - pull_request: - paths-ignore: - - "doc/**" - - "docs/**" - - "**.md" - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - # manual trigger - # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow - workflow_dispatch: - -env: - RUST_VERSION: stable - -jobs: - prepare: - name: Build native and prepare data - runs-on: ubuntu-latest - container: - image: amd64/rust - env: - JAVA_VERSION: 11 - steps: - - uses: actions/checkout@v6 - - name: Setup Rust & Java toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: ${{env.RUST_VERSION}} - jdk-version: 11 - - name: Cache Maven dependencies - uses: actions/cache@v5 - with: - path: | - ~/.m2/repository - /root/.m2/repository - key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ runner.os }}-java-maven- - - name: Cache TPC-H generated data - id: cache-tpch-sf-1 - uses: actions/cache@v5 - with: - path: ./tpch - key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }} - - name: Build Comet - run: make release - - name: Generate TPC-H (SF=1) table data - if: steps.cache-tpch-sf-1.outputs.cache-hit != 'true' - run: | - cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCHData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--location `pwd`/.. --scaleFactor 1 --numPartitions 1 --overwrite" - cd .. - - benchmark: - name: Run TPCHQuerySuite - runs-on: ubuntu-latest - needs: [prepare] - container: - image: amd64/rust - steps: - - uses: actions/checkout@v6 - - name: Setup Rust & Java toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: ${{env.RUST_VERSION}} - jdk-version: 11 - - name: Cache Maven dependencies - uses: actions/cache@v5 - with: - path: | - ~/.m2/repository - /root/.m2/repository - key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ runner.os }}-java-maven- - - name: Restore TPC-H generated data - id: cache-tpch-sf-1 - uses: actions/cache/restore@v5 - with: - path: ./tpch - key: tpch-${{ hashFiles('.github/workflows/benchmark-tpch.yml') }} - fail-on-cache-miss: true # it's always be cached as it should be generated by pre-step if not existed - - name: Build Comet - run: make release - - name: Run TPC-H queries - run: | - SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index 4a0b277618..bb7e917af7 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -47,8 +47,23 @@ env: jobs: + # Fast lint check - gates all other jobs + lint: + name: Lint + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v6 + + - name: Check Rust formatting + run: | + rustup component add rustfmt + cd native && cargo fmt --all -- --check + # Build native library once and share with all test jobs build-native: + needs: lint name: Build Native Library runs-on: ubuntu-latest container: @@ -62,8 +77,8 @@ jobs: rust-version: ${{ env.RUST_VERSION }} jdk-version: 17 # JDK only needed for common module proto generation - - name: Cache Cargo - uses: actions/cache@v4 + - name: Restore Cargo cache + uses: actions/cache/restore@v4 with: path: | ~/.cargo/registry @@ -87,15 +102,21 @@ jobs: path: native/target/ci/libcomet.so retention-days: 1 + - name: Save Cargo cache + uses: actions/cache/save@v4 + if: github.ref == 'refs/heads/main' + with: + path: | + ~/.cargo/registry + ~/.cargo/git + native/target + key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }} + # Run Rust tests (runs in parallel with build-native, uses debug builds) linux-test-rust: - strategy: - matrix: - os: [ubuntu-latest] - java_version: [11, 17] - fail-fast: false - name: ${{ matrix.os }}/java ${{ matrix.java_version }}-rust - runs-on: ${{ matrix.os }} + needs: lint + name: ubuntu-latest/rust-test + runs-on: ubuntu-latest container: image: amd64/rust steps: @@ -105,22 +126,33 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ env.RUST_VERSION }} - jdk-version: ${{ matrix.java_version }} + jdk-version: 17 - - name: Cache Cargo - uses: actions/cache@v4 + - name: Restore Cargo cache + uses: actions/cache/restore@v4 with: path: | ~/.cargo/registry ~/.cargo/git native/target - key: ${{ runner.os }}-cargo-debug-java${{ matrix.java_version }}-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }} + # Note: Java version intentionally excluded - Rust target is JDK-independent + key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }} restore-keys: | - ${{ runner.os }}-cargo-debug-java${{ matrix.java_version }}- + ${{ runner.os }}-cargo-debug- - name: Rust test steps uses: ./.github/actions/rust-test + - name: Save Cargo cache + uses: actions/cache/save@v4 + if: github.ref == 'refs/heads/main' + with: + path: | + ~/.cargo/registry + ~/.cargo/git + native/target + key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }} + linux-test: needs: build-native strategy: @@ -277,3 +309,147 @@ jobs: scan_impl: ${{ matrix.profile.scan_impl }} upload-test-reports: true skip-native-build: true + + # TPC-H correctness test - verifies benchmark queries produce correct results + verify-benchmark-results-tpch: + needs: build-native + name: Verify TPC-H Results + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.RUST_VERSION }} + jdk-version: 11 + + - name: Download native library + uses: actions/download-artifact@v4 + with: + name: native-lib-linux + path: native/target/release/ + + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: | + ~/.m2/repository + /root/.m2/repository + key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-java-maven- + + - name: Cache TPC-H data + id: cache-tpch + uses: actions/cache@v4 + with: + path: ./tpch + key: tpch-${{ hashFiles('.github/workflows/pr_build_linux.yml') }} + + - name: Build project + run: | + ./mvnw -B -Prelease compile test-compile -DskipTests + + - name: Generate TPC-H data (SF=1) + if: steps.cache-tpch.outputs.cache-hit != 'true' + run: | + cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw -B -Prelease exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCHData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--location `pwd`/.. --scaleFactor 1 --numPartitions 1 --overwrite" + + - name: Run TPC-H queries + run: | + SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test + + # TPC-DS correctness tests - verifies benchmark queries produce correct results + verify-benchmark-results-tpcds: + needs: build-native + name: Verify TPC-DS Results (${{ matrix.join }}) + runs-on: ubuntu-latest + container: + image: amd64/rust + strategy: + matrix: + join: [sort_merge, broadcast, hash] + fail-fast: false + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.RUST_VERSION }} + jdk-version: 11 + + - name: Download native library + uses: actions/download-artifact@v4 + with: + name: native-lib-linux + path: native/target/release/ + + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: | + ~/.m2/repository + /root/.m2/repository + key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-java-maven- + + - name: Cache TPC-DS data + id: cache-tpcds + uses: actions/cache@v4 + with: + path: ./tpcds-sf-1 + key: tpcds-${{ hashFiles('.github/workflows/pr_build_linux.yml') }} + + - name: Build project + run: | + ./mvnw -B -Prelease compile test-compile -DskipTests + + - name: Checkout tpcds-kit + if: steps.cache-tpcds.outputs.cache-hit != 'true' + uses: actions/checkout@v6 + with: + repository: databricks/tpcds-kit + path: ./tpcds-kit + + - name: Build tpcds-kit + if: steps.cache-tpcds.outputs.cache-hit != 'true' + run: | + apt-get update && apt-get install -y yacc bison flex gcc-12 g++-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 120 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + cd tpcds-kit/tools && make OS=LINUX + + - name: Generate TPC-DS data (SF=1) + if: steps.cache-tpcds.outputs.cache-hit != 'true' + run: | + cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw -B -Prelease exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1" + + - name: Run TPC-DS queries (Sort merge join) + if: matrix.join == 'sort_merge' + run: | + SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test + env: + SPARK_TPCDS_JOIN_CONF: | + spark.sql.autoBroadcastJoinThreshold=-1 + spark.sql.join.preferSortMergeJoin=true + + - name: Run TPC-DS queries (Broadcast hash join) + if: matrix.join == 'broadcast' + run: | + SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test + env: + SPARK_TPCDS_JOIN_CONF: | + spark.sql.autoBroadcastJoinThreshold=10485760 + + - name: Run TPC-DS queries (Shuffled hash join) + if: matrix.join == 'hash' + run: | + SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test + env: + SPARK_TPCDS_JOIN_CONF: | + spark.sql.autoBroadcastJoinThreshold=-1 + spark.sql.join.forceApplyShuffledHashJoin=true diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index 37e6234569..d76edc008d 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -47,8 +47,23 @@ env: jobs: + # Fast lint check - gates all other jobs (runs on Linux for cost efficiency) + lint: + name: Lint + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v6 + + - name: Check Rust formatting + run: | + rustup component add rustfmt + cd native && cargo fmt --all -- --check + # Build native library once and share with all test jobs build-native: + needs: lint name: Build Native Library (macOS) runs-on: macos-14 steps: @@ -62,8 +77,8 @@ jobs: jdk-architecture: aarch64 protoc-architecture: aarch_64 - - name: Cache Cargo - uses: actions/cache@v4 + - name: Restore Cargo cache + uses: actions/cache/restore@v4 with: path: | ~/.cargo/registry @@ -87,6 +102,16 @@ jobs: path: native/target/ci/libcomet.dylib retention-days: 1 + - name: Save Cargo cache + uses: actions/cache/save@v4 + if: github.ref == 'refs/heads/main' + with: + path: | + ~/.cargo/registry + ~/.cargo/git + native/target + key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }} + macos-aarch64-test: needs: build-native strategy: diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml index 955fc69279..fd5429383b 100644 --- a/.github/workflows/spark_sql_test.yml +++ b/.github/workflows/spark_sql_test.yml @@ -68,8 +68,8 @@ jobs: rust-version: ${{ env.RUST_VERSION }} jdk-version: 17 - - name: Cache Cargo - uses: actions/cache@v4 + - name: Restore Cargo cache + uses: actions/cache/restore@v4 with: path: | ~/.cargo/registry @@ -91,6 +91,16 @@ jobs: path: native/target/ci/libcomet.so retention-days: 1 + - name: Save Cargo cache + uses: actions/cache/save@v4 + if: github.ref == 'refs/heads/main' + with: + path: | + ~/.cargo/registry + ~/.cargo/git + native/target + key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }} + spark-sql-auto-scan: needs: build-native strategy: diff --git a/native/Cargo.lock b/native/Cargo.lock index ce0eb0f2b3..2e53b3c274 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -418,6 +418,23 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async-compression" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +dependencies = [ + "bzip2 0.5.2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + [[package]] name = "async-executor" version = "1.13.3" @@ -1189,6 +1206,34 @@ dependencies = [ "either", ] +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cast" version = "0.3.0" @@ -1784,6 +1829,7 @@ dependencies = [ "datafusion-comet-objectstore-hdfs", "datafusion-comet-proto", "datafusion-comet-spark-expr", + "datafusion-datasource", "datafusion-functions-nested", "datafusion-spark", "futures", @@ -1925,8 +1971,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fde13794244bc7581cd82f6fff217068ed79cdc344cafe4ab2c3a1c3510b38d6" dependencies = [ "arrow", + "async-compression", "async-trait", "bytes", + "bzip2 0.6.1", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -1937,6 +1985,7 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", + "flate2", "futures", "glob", "itertools 0.14.0", @@ -1944,7 +1993,10 @@ dependencies = [ "object_store", "rand 0.9.2", "tokio", + "tokio-util", "url", + "xz2", + "zstd", ] [[package]] @@ -3625,6 +3677,12 @@ dependencies = [ "lexical-util", ] +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.180" @@ -3754,6 +3812,17 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "md-5" version = "0.10.6" @@ -6573,6 +6642,15 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.8.1" diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 5e30883e35..b13d6d54fd 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -60,6 +60,7 @@ tempfile = "3.24.0" itertools = "0.14.0" paste = "1.0.14" datafusion = { workspace = true, features = ["parquet_encryption", "sql"] } +datafusion-datasource = { workspace = true } datafusion-spark = { workspace = true } once_cell = "1.18.0" regex = { workspace = true }