diff --git a/.github/actions/setup-rust/action.yml b/.github/actions/setup-rust/action.yml index 5fea9acaa53..2af54b1c20f 100644 --- a/.github/actions/setup-rust/action.yml +++ b/.github/actions/setup-rust/action.yml @@ -83,13 +83,3 @@ runs: - name: Install Protoc (for lance-encoding build step) if: runner.os != 'Windows' uses: ./.github/actions/setup-protoc - - - name: Install Sweep - shell: bash - if: ${{ inputs.timestamp == 'true' && github.ref_name == 'develop' }} - run: cargo install cargo-sweep - - - name: Timestamp Cache - shell: bash - if: ${{ inputs.timestamp == 'true' && github.ref_name == 'develop' }} - run: cargo sweep --stamp diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index d3de66765dc..34610171419 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -137,82 +137,3 @@ jobs: secrets: inherit with: mode: "pr" - benchmark_matrix: | - [ - { - "id": "clickbench-nvme", - "subcommand": "clickbench", - "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb", - "extra_data_formats": "vortex-compact" - }, - { - "id": "tpch-nvme", - "subcommand": "tpch", - "name": "TPC-H SF=1 on NVME", - "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "1.0" - }, - { - "id": "tpch-s3", - "subcommand": "tpch", - "name": "TPC-H SF=1 on S3", - "local_dir": "vortex-bench/data/tpch/1.0", - "remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "1.0" - }, - { - "id": "tpch-nvme-10", - "subcommand": "tpch", - "name": "TPC-H SF=10 on NVME", - "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "10.0" - }, - { - "id": "tpch-s3-10", - "subcommand": "tpch", - "name": "TPC-H SF=10 on S3", - "local_dir": "vortex-bench/data/tpch/10.0", - "remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "10.0" - }, - { - "id": "tpcds-nvme", - "subcommand": "tpcds", - "name": "TPC-DS SF=1 on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "1.0" - }, - { - "id": "statpopgen", - "subcommand": "statpopgen", - "name": "Statistical and Population Genetics", - "targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "100" - }, - { - "id": "fineweb", - "subcommand": "fineweb", - "name": "FineWeb NVMe", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "100" - }, - { - "id": "fineweb-s3", - "subcommand": "fineweb", - "name": "FineWeb S3", - "local_dir": "vortex-bench/data/fineweb", - "remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/fineweb/", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "100" - }, - { - "id": "polarsignals", - "subcommand": "polarsignals", - "name": "PolarSignals Profiling", - "targets": "datafusion:vortex", - "scale_factor": "1" - }, - ] diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index abc80c1d93e..645faf9fb37 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -14,7 +14,7 @@ permissions: jobs: commit-metadata: runs-on: ubuntu-latest - timeout-minutes: 120 + timeout-minutes: 10 steps: - uses: actions/checkout@v6 - name: Setup AWS CLI @@ -118,83 +118,3 @@ jobs: secrets: inherit with: mode: "develop" - benchmark_matrix: | - [ - { - "id": "clickbench-nvme", - "subcommand": "clickbench", - "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "build_lance": true - }, - { - "id": "tpch-nvme", - "subcommand": "tpch", - "name": "TPC-H SF=1 on NVME", - "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "1.0", - "build_lance": true - }, - { - "id": "tpch-s3", - "subcommand": "tpch", - "name": "TPC-H SF=1 on S3", - "local_dir": "vortex-bench/data/tpch/1.0", - "remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "1.0" - }, - { - "id": "tpch-nvme-10", - "subcommand": "tpch", - "name": "TPC-H SF=10 on NVME", - "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "10.0", - "build_lance": true - }, - { - "id": "tpch-s3-10", - "subcommand": "tpch", - "name": "TPC-H SF=10 on S3", - "local_dir": "vortex-bench/data/tpch/10.0", - "remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "10.0" - }, - { - "id": "tpcds-nvme", - "subcommand": "tpcds", - "name": "TPC-DS SF=1 on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "1.0" - }, - { - "id": "statpopgen", - "subcommand": "statpopgen", - "name": "Statistical and Population Genetics", - "local_dir": "vortex-bench/data/statpopgen", - "targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "100" - }, - { - "id": "fineweb", - "subcommand": "fineweb", - "name": "FineWeb NVMe", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact" - }, - { - "id": "fineweb-s3", - "subcommand": "fineweb", - "name": "FineWeb S3", - "local_dir": "vortex-bench/data/fineweb", - "remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/fineweb/", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact" - }, - { - "id": "polarsignals", - "subcommand": "polarsignals", - "name": "PolarSignals Profiling", - "targets": "datafusion:vortex", - "scale_factor": "1" - }, - ] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 290612d099b..5126ed3e392 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ permissions: issues: write # audit-check creates issues env: - CARGO_TERM_COLOR: auto + CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 NIGHTLY_TOOLCHAIN: nightly-2026-02-05 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 6a7b6c47f4f..307526d92f3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -66,7 +66,7 @@ jobs: name: github-pages url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest - timeout-minutes: 120 + timeout-minutes: 10 needs: build steps: # Note, since we provide the job with a CloudFlare scoped API token, we run it in a separate job that doesn't diff --git a/.github/workflows/fuzz-coverage.yml b/.github/workflows/fuzz-coverage.yml index ee4ed865b4b..3aab125d671 100644 --- a/.github/workflows/fuzz-coverage.yml +++ b/.github/workflows/fuzz-coverage.yml @@ -11,6 +11,11 @@ env: jobs: coverage: name: "Coverage: ${{ matrix.fuzz_target }}" + env: + AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + AWS_REGION: "us-east-1" + AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" strategy: fail-fast: false matrix: @@ -56,11 +61,6 @@ jobs: - name: Download corpus from R2 shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} - AWS_REGION: "us-east-1" - AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" run: | CORPUS_KEY="${{ matrix.fuzz_target }}_corpus.tar.zst" CORPUS_DIR="fuzz/corpus/${{ matrix.fuzz_target }}" diff --git a/.github/workflows/minimize_fuzz_corpus_workflow.yml b/.github/workflows/minimize_fuzz_corpus_workflow.yml index 784d1e059d1..47475c8e8f9 100644 --- a/.github/workflows/minimize_fuzz_corpus_workflow.yml +++ b/.github/workflows/minimize_fuzz_corpus_workflow.yml @@ -34,6 +34,11 @@ env: jobs: minimize: name: "Minimize ${{ inputs.fuzz_target }}" + env: + AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + AWS_REGION: "us-east-1" + AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" runs-on: >- ${{ github.repository == 'vortex-data/vortex' && format('runs-on={0}/runner=arm64-medium/disk=large/tag={1}-minimize', github.run_id, inputs.fuzz_target) @@ -64,11 +69,6 @@ jobs: - name: Restore corpus shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} - AWS_REGION: "us-east-1" - AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" run: | CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst" CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}" @@ -98,11 +98,6 @@ jobs: - name: Persist corpus shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} - AWS_REGION: "us-east-1" - AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" run: | CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst" CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}" diff --git a/.github/workflows/run-fuzzer.yml b/.github/workflows/run-fuzzer.yml index 82f0dd0ace4..de1f1e0bf3b 100644 --- a/.github/workflows/run-fuzzer.yml +++ b/.github/workflows/run-fuzzer.yml @@ -61,6 +61,10 @@ jobs: name: "Run ${{ inputs.fuzz_name || inputs.fuzz_target }}" env: FUZZ_NAME: ${{ inputs.fuzz_name || inputs.fuzz_target }} + AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + AWS_REGION: "us-east-1" + AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" timeout-minutes: 230 # almost 4 hours runs-on: >- ${{ github.repository == 'vortex-data/vortex' @@ -95,14 +99,9 @@ jobs: - name: Restore corpus shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} - AWS_REGION: "us-east-1" - AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" run: | CORPUS_KEY="${FUZZ_NAME}_corpus.tar.zst" - CORPUS_DIR="fuzz/corpus/${FUZZ_NAME}-${{ inputs.extra_features }}" + CORPUS_DIR="fuzz/corpus/${FUZZ_NAME}" # Try to download corpus if python3 scripts/s3-download.py "s3://vortex-fuzz-corpus/$CORPUS_KEY" "$CORPUS_KEY"; then @@ -189,11 +188,6 @@ jobs: - name: Persist corpus shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} - AWS_REGION: "us-east-1" - AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com" run: | CORPUS_KEY="${FUZZ_NAME}_corpus.tar.zst" CORPUS_DIR="fuzz/corpus/${FUZZ_NAME}" diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index c12a5fe7bea..2abc1f9be70 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -14,7 +14,6 @@ on: required: false type: string description: "JSON string containing the matrix configuration" - # We do not include lance in the default configuration. default: | [ { @@ -28,7 +27,8 @@ on: "subcommand": "tpch", "name": "TPC-H SF=1 on NVME", "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "1.0" + "scale_factor": "1.0", + "iterations": "10" }, { "id": "tpch-s3", @@ -45,7 +45,8 @@ on: "subcommand": "tpch", "name": "TPC-H SF=10 on NVME", "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", - "scale_factor": "10.0" + "scale_factor": "10.0", + "iterations": "10" }, { "id": "tpch-s3-10", @@ -68,6 +69,7 @@ on: "id": "statpopgen", "subcommand": "statpopgen", "name": "Statistical and Population Genetics", + "local_dir": "vortex-bench/data/statpopgen", "targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "100" }, @@ -85,8 +87,7 @@ on: "local_dir": "vortex-bench/data/fineweb", "remote_storage": "s3://vortex-ci-benchmark-datasets/${{github.ref_name}}/${{github.run_id}}/fineweb/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", - "scale_factor": "100", - "iterations": "10" + "scale_factor": "100" }, { "id": "polarsignals", @@ -135,13 +136,32 @@ jobs: - uses: ./.github/actions/system-info + - name: Resolve targets + id: resolve + shell: bash + run: | + targets="${{ matrix.targets }}" + # Non-PR modes include additional targets + if [ "${{ inputs.mode }}" != "pr" ]; then + case "${{ matrix.subcommand }}" in + clickbench) targets="$targets,datafusion:vortex-compact,duckdb:vortex-compact" ;; + esac + # Lance comparisons for local clickbench/tpch (not S3) + if [ -z "${{ matrix.remote_storage }}" ]; then + case "${{ matrix.subcommand }}" in + clickbench|tpch) targets="$targets,datafusion:lance" ;; + esac + fi + fi + echo "targets=$targets" >> $GITHUB_OUTPUT + - name: Build binaries shell: bash env: RUSTFLAGS: "-C target-cpu=native" run: | packages="--bin data-gen --bin datafusion-bench --bin duckdb-bench" - if [ "${{ matrix.build_lance }}" = "true" ]; then + if [ "${{ inputs.mode }}" != "pr" ]; then packages="$packages --bin lance-bench" fi cargo build $packages --profile release_debug --features unstable_encodings @@ -152,7 +172,7 @@ jobs: RUST_BACKTRACE: full run: | # Extract all unique formats from targets (e.g., "datafusion:parquet,duckdb:vortex" -> "parquet,vortex") - all_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | sed 's/^[^:]*://' | sort -u | tr '\n' ',' | sed 's/,$//') + all_formats=$(echo "${{ steps.resolve.outputs.targets }}" | tr ',' '\n' | sed 's/^[^:]*://' | sort -u | tr '\n' ',' | sed 's/,$//') # Append extra data formats if specified (for file size tracking without benchmarking) if [ -n "${{ matrix.extra_data_formats }}" ]; then @@ -206,7 +226,7 @@ jobs: OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}" OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}" run: | - bash scripts/bench-taskset.sh .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \ + bash scripts/bench-taskset.sh .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ steps.resolve.outputs.targets }}" \ ${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \ ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }} @@ -221,7 +241,7 @@ jobs: OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}" OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}" run: | - bash scripts/bench-taskset.sh .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \ + bash scripts/bench-taskset.sh .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ steps.resolve.outputs.targets }}" \ ${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \ --remote-storage "${{ matrix.remote_storage }}" \ --benchmark-id "${{ matrix.id }}" \ diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index d24b6a1ec6b..04d44fec222 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -7,6 +7,7 @@ on: jobs: close-issues: runs-on: ubuntu-latest + timeout-minutes: 10 permissions: pull-requests: write steps: @@ -17,7 +18,7 @@ jobs: # PR has 7 more days to become active, otherwise it will be closed. days-before-pr-close: 7 stale-pr-label: "stale" - stale-pr-message: "This PR has been marked as stale because it has been open for 30 days with no activity. Please comment or remove the stale label if you wish to keep it active, otherwise it will be closed in 7 days" + stale-pr-message: "This PR has been marked as stale because it has been open for 14 days with no activity. Please comment or remove the stale label if you wish to keep it active, otherwise it will be closed in 7 days" close-pr-message: "This PR was closed because it has been inactive for 7 days since being marked as stale." days-before-issue-stale: -1 days-before-issue-close: -1