diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100644 index 0000000000..8a8791e711 --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,113 @@ +# GitHub Workflows + +GitHub Actions only loads `*.yml` / `*.yaml` files in this directory as +workflows. This README is ignored by the runner. + +## Pipeline overview + +A single umbrella workflow (`ci.yml`) orchestrates everything that runs on +pull requests and pushes to `main`. The umbrella runs cheap **preflight** +checks first, computes which heavy jobs are relevant to the change, and only +then fans out to the long-running test/build workflows. Each long workflow +is a `workflow_call` reusable invoked from the umbrella. + +``` + pull_request | push to main | workflow_dispatch + | + v + +-----------------------+ + | preflight | ubuntu-slim + | (RAT, prettier, | + | missing-suites, | + | actionlint) | + +-----------+-----------+ + | on success + v + +-----------------------+ + | changes | ubuntu-slim + | (compute-changes.py: | + | one boolean per | + | heavy job) | + +-----------+-----------+ + | + +-----------+-----------+-----------+-----------+-----------+-----------+ + | | | | | | | + v v v v v v v + pr_build_ pr_build_ pr_benchmark_ docs spark_3_5 spark_4_0 iceberg_1_10 + linux macos check (push) (PR+push) (PR+push) (PR+push) + (PR+push) (PR+push) (PR+push) + | | | + v v v + spark_3_4 / spark_4_1 iceberg_1_8 / 1_9 + (push or PR + label) (push only) + + reusable workflows invoked via `uses:`: + pr_build_linux.yml spark_sql_test_reusable.yml + pr_build_macos.yml iceberg_spark_test_reusable.yml + pr_benchmark_check.yml + docs.yaml +``` + +## What runs when + +| Job in `ci.yml` | Triggered by | Path filter source | +| -------------------- | ------------------------------------------------ | ----------------------------------- | +| `preflight` | every PR / push to main / dispatch | none (always runs) | +| `changes` | every PR / push to main / dispatch | runs 
`dev/ci/compute-changes.py` | +| `pr_build_linux` | PR or push, paths matched | `dev/ci/compute-changes.py` | +| `pr_build_macos` | PR or push, paths matched | `dev/ci/compute-changes.py` | +| `pr_benchmark_check` | PR or push, paths matched | benchmark sources only | +| `docs` | push to main, paths matched | `.asf.yaml`, `docs/**`, `docs.yaml` | +| `spark_3_5` | PR or push, paths matched | Spark 3.5 sources | +| `spark_4_0` | PR or push, paths matched | Spark 4.0 sources | +| `spark_3_4` | push, **or** PR with `run-spark-3.4-tests` label | Spark 3.4 sources | +| `spark_4_1` | push, **or** PR with `run-spark-4.1-tests` label | Spark 4.1 sources | +| `iceberg_1_10` | PR or push, paths matched | Iceberg sources | +| `iceberg_1_8` | push only | Iceberg sources | +| `iceberg_1_9` | push only | Iceberg sources | + +A heavy job appears in the PR's checks list as a `skipped` entry whenever +its path filter or event criteria don't match. Skipped checks count as +passing for branch protection. + +## Standalone workflows (not under the umbrella) + +These workflows have their own triggers because they fire on events the +umbrella doesn't watch, or operate independently of the rest of CI: + +| File | Why standalone | +| ---------------------- | ---------------------------------------------------------------------------------------------------- | +| `pr_title_check.yml` | Fires on `pull_request.types: [edited]` so it re-runs when a PR title is edited without a code push. | +| `codeql.yml` | Security scanner; weekly schedule + on every push/PR. | +| `miri.yml` | Nightly Miri safety checks. | +| `stale.yml` | Daily stale-PR closer. | +| `take.yml` | Issue-comment trigger for `take` / `untake`. | +| `label_new_issues.yml` | Issue trigger to apply `requires-triage`. 
| + +## Reusable workflows (called by `ci.yml`) + +| File | Called from `ci.yml` job(s) | +| --------------------------------- | -------------------------------------------------- | +| `pr_build_linux.yml` | `pr_build_linux` | +| `pr_build_macos.yml` | `pr_build_macos` | +| `pr_benchmark_check.yml` | `pr_benchmark_check` | +| `docs.yaml` | `docs` | +| `spark_sql_test_reusable.yml` | `spark_3_4`, `spark_3_5`, `spark_4_0`, `spark_4_1` | +| `iceberg_spark_test_reusable.yml` | `iceberg_1_8`, `iceberg_1_9`, `iceberg_1_10` | + +## Modifying path filters + +Each long workflow's "what files trigger me" rules live in the `FILTERS` +dict at the top of `dev/ci/compute-changes.py`. The `changes` job in +`ci.yml` invokes that script and the gate `if:` on each long job consumes +`needs.changes.outputs.<key>` (for example `needs.changes.outputs.spark_3_5`). When adding a new test suite or moving +sources, update the relevant filter entry there. + +## Branch protection + +Required-check names changed when these workflows were consolidated. The +umbrella exposes per-job names like `CI / pr_build_linux / Lint`, +`CI / spark_3_5 / linux-test (...)`, etc. Update repository branch +protection rules to point at the new names; the old standalone workflow +names (`Spark SQL Tests (Spark 3.5)`, `PR Build (Linux)`, ...) no longer +exist as top-level workflows. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000..e96f40461d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,292 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Top-level CI orchestrator: runs cheap preflight checks first, then fans out +# to the long-running test/build workflows only if preflight passed and the +# PR/push touched files relevant to that workflow. + +name: CI + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +on: + pull_request: + types: [opened, synchronize, reopened, labeled] + push: + branches: + - main + workflow_dispatch: + +jobs: + # --------------------------------------------------------------------------- + # preflight: cheap checks that gate everything else. Failure short-circuits + # the entire pipeline before any heavy job spins up. Folds in what used to be + # pr_rat_check, pr_markdown_format, pr_missing_suites, and validate_workflows. + # pr_title_check stays a standalone workflow because it needs to fire on PR + # `edited` events. 
+ # --------------------------------------------------------------------------- + preflight: + name: Preflight + runs-on: ubuntu-slim + steps: + - uses: actions/checkout@v6 + + - name: Set up Java + uses: actions/setup-java@v5 + with: + distribution: temurin + java-version: 11 + + - name: Apache RAT license check + run: ./mvnw -B -N apache-rat:check + + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version: '24' + + - name: Install prettier + run: npm install -g prettier + + - name: Check markdown formatting + run: prettier --check "**/*.md" + + - name: Check missing suites + run: python3 dev/ci/check-suites.py + + - name: Install actionlint + run: | + curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash | bash + echo "$PWD" >> $GITHUB_PATH + + - name: Lint GitHub Actions workflows + run: actionlint -color --shellcheck=off + + # --------------------------------------------------------------------------- + # changes: compute which long jobs need to run for this event. Replaces the + # per-workflow `on: paths:` filters that used to gate triggering. Filter + # rules live in dev/ci/compute-changes.py, which is invoked here in lieu of + # dorny/paths-filter (not on the apache org actions allow list). On + # workflow_dispatch every output is forced true so a manual run can + # exercise any gated job. 
+ # --------------------------------------------------------------------------- + changes: + name: Detect changes + needs: preflight + runs-on: ubuntu-slim + outputs: + build_linux: ${{ steps.compute.outputs.build_linux }} + build_macos: ${{ steps.compute.outputs.build_macos }} + benchmark: ${{ steps.compute.outputs.benchmark }} + docs: ${{ steps.compute.outputs.docs }} + spark_3_4: ${{ steps.compute.outputs.spark_3_4 }} + spark_3_5: ${{ steps.compute.outputs.spark_3_5 }} + spark_4_0: ${{ steps.compute.outputs.spark_4_0 }} + spark_4_1: ${{ steps.compute.outputs.spark_4_1 }} + iceberg_1_8: ${{ steps.compute.outputs.iceberg_1_8 }} + iceberg_1_9: ${{ steps.compute.outputs.iceberg_1_9 }} + iceberg_1_10: ${{ steps.compute.outputs.iceberg_1_10 }} + steps: + - uses: actions/checkout@v6 + with: + # Need both branches' history so we can diff base..head for PRs and + # before..after for pushes. + fetch-depth: 0 + + - name: Compute outputs + id: compute + shell: bash + env: + EVENT_NAME: ${{ github.event_name }} + PR_BASE_SHA: ${{ github.event.pull_request.base.sha }} + PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} + PUSH_BEFORE: ${{ github.event.before }} + PUSH_AFTER: ${{ github.sha }} + run: | + set -euo pipefail + if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then + for key in build_linux build_macos benchmark docs spark_3_4 spark_3_5 spark_4_0 spark_4_1 iceberg_1_8 iceberg_1_9 iceberg_1_10; do + echo "${key}=true" >> "$GITHUB_OUTPUT" + done + exit 0 + fi + if [[ "$EVENT_NAME" == "pull_request" ]]; then + git diff --name-only "$PR_BASE_SHA"..."$PR_HEAD_SHA" > changed_files.txt + else + # push to main; first push to a branch has all-zero before sha + if [[ "$PUSH_BEFORE" =~ ^0+$ ]]; then + git ls-tree -r --name-only "$PUSH_AFTER" > changed_files.txt + else + git diff --name-only "$PUSH_BEFORE".."$PUSH_AFTER" > changed_files.txt + fi + fi + echo "Changed files:" + cat changed_files.txt + python3 dev/ci/compute-changes.py changed_files.txt >> "$GITHUB_OUTPUT" + + # 
--------------------------------------------------------------------------- + # Heavy jobs: each is a thin caller of an existing reusable workflow. The + # `if:` expressions encode the same event/label/path criteria the + # standalone trigger workflows used to encode in their `on:` blocks. + # --------------------------------------------------------------------------- + + pr_build_linux: + name: PR Build (Linux) + needs: changes + if: | + needs.changes.outputs.build_linux == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request') + uses: ./.github/workflows/pr_build_linux.yml + + pr_build_macos: + name: PR Build (macOS) + needs: changes + if: | + needs.changes.outputs.build_macos == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request') + uses: ./.github/workflows/pr_build_macos.yml + + pr_benchmark_check: + name: PR Benchmark Check + needs: changes + if: | + needs.changes.outputs.benchmark == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request') + uses: ./.github/workflows/pr_benchmark_check.yml + + docs: + name: Deploy Comet site + needs: changes + # docs deploys to asf-site, so only run on push-to-main (or a manual dispatch). + if: | + needs.changes.outputs.docs == 'true' && + (github.event_name == 'push' || github.event_name == 'workflow_dispatch') + uses: ./.github/workflows/docs.yaml + + spark_3_4: + name: Spark SQL Tests (Spark 3.4) + needs: changes + # Main-only by default; PRs need the `run-spark-3.4-tests` label. 
+ if: | + needs.changes.outputs.spark_3_4 == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'run-spark-3.4-tests'))) + uses: ./.github/workflows/spark_sql_test_reusable.yml + with: + spark-short: '3.4' + spark-full: '3.4.3' + java: 11 + + spark_3_5: + name: Spark SQL Tests (Spark 3.5) + needs: changes + if: | + needs.changes.outputs.spark_3_5 == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request') + uses: ./.github/workflows/spark_sql_test_reusable.yml + with: + spark-short: '3.5' + spark-full: '3.5.8' + java: 17 + + spark_4_0: + name: Spark SQL Tests (Spark 4.0) + needs: changes + if: | + needs.changes.outputs.spark_4_0 == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request') + uses: ./.github/workflows/spark_sql_test_reusable.yml + with: + spark-short: '4.0' + spark-full: '4.0.2' + java: 17 + + spark_4_1: + name: Spark SQL Tests (Spark 4.1) + needs: changes + # Main-only by default; PRs need the `run-spark-4.1-tests` label. + if: | + needs.changes.outputs.spark_4_1 == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'run-spark-4.1-tests'))) + uses: ./.github/workflows/spark_sql_test_reusable.yml + with: + spark-short: '4.1' + spark-full: '4.1.1' + java: 17 + + iceberg_1_8: + name: Iceberg Spark SQL Tests (Iceberg 1.8) + needs: changes + # Main-only; never runs on PR events. 
+ if: | + needs.changes.outputs.iceberg_1_8 == 'true' && + (github.event_name == 'push' || github.event_name == 'workflow_dispatch') + uses: ./.github/workflows/iceberg_spark_test_reusable.yml + with: + iceberg-short: '1.8' + iceberg-full: '1.8.1' + spark-short: '3.4' + spark-full: '3.4.3' + java: 11 + + iceberg_1_9: + name: Iceberg Spark SQL Tests (Iceberg 1.9) + needs: changes + # Main-only; never runs on PR events. + if: | + needs.changes.outputs.iceberg_1_9 == 'true' && + (github.event_name == 'push' || github.event_name == 'workflow_dispatch') + uses: ./.github/workflows/iceberg_spark_test_reusable.yml + with: + iceberg-short: '1.9' + iceberg-full: '1.9.1' + spark-short: '3.5' + spark-full: '3.5.8' + java: 17 + + iceberg_1_10: + name: Iceberg Spark SQL Tests (Iceberg 1.10) + needs: changes + if: | + needs.changes.outputs.iceberg_1_10 == 'true' && + (github.event_name == 'push' || + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request') + uses: ./.github/workflows/iceberg_spark_test_reusable.yml + with: + iceberg-short: '1.10' + iceberg-full: '1.10.0' + spark-short: '3.5' + spark-full: '3.5.8' + java: 17 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 189cad6826..6f607d278c 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -15,17 +15,13 @@ # specific language governing permissions and limitations # under the License. -on: - push: - branches: - - main - paths: - - .asf.yaml - - .github/workflows/docs.yaml - - docs/** - name: Deploy DataFusion Comet site +# Reusable: invoked by ci.yml. Triggering and path filters live in the +# umbrella workflow. 
+on: + workflow_call: + jobs: build-docs: name: Build docs diff --git a/.github/workflows/iceberg_spark_test_1_10.yml b/.github/workflows/iceberg_spark_test_1_10.yml deleted file mode 100644 index 8df8a22c74..0000000000 --- a/.github/workflows/iceberg_spark_test_1_10.yml +++ /dev/null @@ -1,77 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Runs on every PR and on main. Latest supported Iceberg version, paired with -# Spark 3.5. 
-name: Iceberg Spark SQL Tests (Iceberg 1.10) - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/iceberg/**" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/iceberg_spark_test_1_10.yml" - - ".github/workflows/iceberg_spark_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-iceberg-builder/**" - pull_request: - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/iceberg/**" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/iceberg_spark_test_1_10.yml" - - ".github/workflows/iceberg_spark_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-iceberg-builder/**" - workflow_dispatch: - -jobs: - iceberg-spark: - uses: ./.github/workflows/iceberg_spark_test_reusable.yml - with: - iceberg-short: '1.10' - iceberg-full: '1.10.0' - spark-short: '3.5' - spark-full: '3.5.8' - java: 17 diff --git a/.github/workflows/iceberg_spark_test_1_8.yml b/.github/workflows/iceberg_spark_test_1_8.yml deleted file mode 100644 index 111d1af46a..0000000000 --- a/.github/workflows/iceberg_spark_test_1_8.yml +++ /dev/null @@ -1,57 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Runs on main only. Oldest supported Iceberg version, paired with Spark 3.4. -name: Iceberg Spark SQL Tests (Iceberg 1.8) - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/iceberg/**" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/iceberg_spark_test_1_8.yml" - - ".github/workflows/iceberg_spark_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-iceberg-builder/**" - workflow_dispatch: - -jobs: - iceberg-spark: - uses: ./.github/workflows/iceberg_spark_test_reusable.yml - with: - iceberg-short: '1.8' - iceberg-full: '1.8.1' - spark-short: '3.4' - spark-full: '3.4.3' - java: 11 diff --git a/.github/workflows/iceberg_spark_test_1_9.yml b/.github/workflows/iceberg_spark_test_1_9.yml deleted file mode 100644 index f8a2361cf0..0000000000 --- a/.github/workflows/iceberg_spark_test_1_9.yml +++ /dev/null @@ -1,57 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Runs on main only. Iceberg 1.9 paired with Spark 3.5. -name: Iceberg Spark SQL Tests (Iceberg 1.9) - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/iceberg/**" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/iceberg_spark_test_1_9.yml" - - ".github/workflows/iceberg_spark_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-iceberg-builder/**" - workflow_dispatch: - -jobs: - iceberg-spark: - uses: ./.github/workflows/iceberg_spark_test_reusable.yml - with: - iceberg-short: '1.9' - iceberg-full: '1.9.1' - spark-short: '3.5' - spark-full: '3.5.8' - java: 17 diff --git a/.github/workflows/pr_benchmark_check.yml b/.github/workflows/pr_benchmark_check.yml index b07cc03c34..d7e4905d64 100644 --- a/.github/workflows/pr_benchmark_check.yml +++ b/.github/workflows/pr_benchmark_check.yml @@ -20,24 +20,10 @@ name: PR Benchmark Check 
-concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - +# Reusable: invoked by ci.yml. Triggering, path filters, and concurrency +# live in the umbrella workflow. on: - push: - branches: - - main - paths: - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - pull_request: - paths: - - "native/core/benches/**" - - "native/spark-expr/benches/**" - - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" - workflow_dispatch: + workflow_call: env: RUST_VERSION: stable diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index 0e4988e368..422232f546 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -17,63 +17,10 @@ name: PR Build (Linux) -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - +# Reusable: invoked by ci.yml. Triggering, path filters, and concurrency +# live in the umbrella workflow. on: - # Allow-list of paths that affect this workflow. A change must match a positive - # pattern (and not a trailing "!" exclusion) for the build to run. Editing - # pr_build_macos.yml does not trigger this workflow, and vice versa. 
- push: - branches: - - main - paths: - - "native/**" - - "common/**" - - "spark/**" - - "spark-integration/**" - - "pom.xml" - - "**/pom.xml" - - ".mvn/**" - - "mvnw" - - "Makefile" - - "rust-toolchain.toml" - - "dev/ci/**" - - ".github/workflows/pr_build_linux.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/java-test/**" - - ".github/actions/rust-test/**" - - "!**.md" - - "!native/core/benches/**" - - "!native/spark-expr/benches/**" - - "!spark/src/test/scala/org/apache/spark/sql/benchmark/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - pull_request: - paths: - - "native/**" - - "common/**" - - "spark/**" - - "spark-integration/**" - - "pom.xml" - - "**/pom.xml" - - ".mvn/**" - - "mvnw" - - "Makefile" - - "rust-toolchain.toml" - - "dev/ci/**" - - ".github/workflows/pr_build_linux.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/java-test/**" - - ".github/actions/rust-test/**" - - "!**.md" - - "!native/core/benches/**" - - "!native/spark-expr/benches/**" - - "!spark/src/test/scala/org/apache/spark/sql/benchmark/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - # manual trigger - # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow - workflow_dispatch: + workflow_call: env: RUST_VERSION: stable diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml index 5101f5290c..d0a03eeb75 100644 --- a/.github/workflows/pr_build_macos.yml +++ b/.github/workflows/pr_build_macos.yml @@ -17,61 +17,10 @@ name: PR Build (macOS) -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - +# Reusable: invoked by ci.yml. Triggering, path filters, and concurrency +# live in the umbrella workflow. on: - # Allow-list of paths that affect this workflow. A change must match a positive - # pattern (and not a trailing "!" exclusion) for the build to run. 
Editing - # pr_build_linux.yml does not trigger this workflow, and vice versa. - push: - branches: - - main - paths: - - "native/**" - - "common/**" - - "spark/**" - - "spark-integration/**" - - "pom.xml" - - "**/pom.xml" - - ".mvn/**" - - "mvnw" - - "Makefile" - - "rust-toolchain.toml" - - "dev/ci/**" - - ".github/workflows/pr_build_macos.yml" - - ".github/actions/setup-macos-builder/**" - - ".github/actions/java-test/**" - - "!**.md" - - "!native/core/benches/**" - - "!native/spark-expr/benches/**" - - "!spark/src/test/scala/org/apache/spark/sql/benchmark/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - pull_request: - paths: - - "native/**" - - "common/**" - - "spark/**" - - "spark-integration/**" - - "pom.xml" - - "**/pom.xml" - - ".mvn/**" - - "mvnw" - - "Makefile" - - "rust-toolchain.toml" - - "dev/ci/**" - - ".github/workflows/pr_build_macos.yml" - - ".github/actions/setup-macos-builder/**" - - ".github/actions/java-test/**" - - "!**.md" - - "!native/core/benches/**" - - "!native/spark-expr/benches/**" - - "!spark/src/test/scala/org/apache/spark/sql/benchmark/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - # manual trigger - # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow - workflow_dispatch: + workflow_call: env: RUST_VERSION: stable diff --git a/.github/workflows/pr_markdown_format.yml b/.github/workflows/pr_markdown_format.yml deleted file mode 100644 index 5239e9083a..0000000000 --- a/.github/workflows/pr_markdown_format.yml +++ /dev/null @@ -1,49 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Check Markdown Formatting - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - pull_request: - paths: - - '**.md' - -jobs: - prettier-check: - runs-on: ubuntu-slim - steps: - - uses: actions/checkout@v6 - - - name: Setup Node.js - uses: actions/setup-node@v6 - with: - node-version: '24' - - - name: Install prettier - run: npm install -g prettier - - - name: Check markdown formatting - run: | - # if you encounter error, run prettier locally and commit changes using instructions at: - # - # https://datafusion.apache.org/comet/contributor-guide/development.html#how-to-format-md-document - # - prettier --check "**/*.md" diff --git a/.github/workflows/pr_missing_suites.yml b/.github/workflows/pr_missing_suites.yml deleted file mode 100644 index 3d42392549..0000000000 --- a/.github/workflows/pr_missing_suites.yml +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Check that all test suites are added to PR workflows - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] - -jobs: - check-missing-suites: - runs-on: ubuntu-slim - steps: - - uses: actions/checkout@v6 - - name: Check Missing Suites - run: python3 dev/ci/check-suites.py diff --git a/.github/workflows/pr_rat_check.yml b/.github/workflows/pr_rat_check.yml deleted file mode 100644 index 5c2352ee3b..0000000000 --- a/.github/workflows/pr_rat_check.yml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: RAT License Check - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -permissions: - contents: read - -# No paths-ignore: this workflow must run for ALL changes including docs -on: - push: - branches: - - main - pull_request: - workflow_dispatch: - -jobs: - rat-check: - name: RAT License Check - runs-on: ubuntu-slim - steps: - - uses: actions/checkout@v6 - - name: Set up Java - uses: actions/setup-java@v5 - with: - distribution: temurin - java-version: 11 - - name: Run RAT check - run: ./mvnw -B -N apache-rat:check diff --git a/.github/workflows/spark_sql_test_3_4.yml b/.github/workflows/spark_sql_test_3_4.yml deleted file mode 100644 index e894f0a5c1..0000000000 --- a/.github/workflows/spark_sql_test_3_4.yml +++ /dev/null @@ -1,72 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Runs on main only. Extra coverage for the oldest supported Spark. 
-name: Spark SQL Tests (Spark 3.4) - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/spark-3.5/**" - - "!spark/src/main/spark-4.0/**" - - "!spark/src/main/spark-4.1/**" - - "!spark/src/main/spark-4.2/**" - - "!spark/src/main/spark-4.x/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/3.4.3.diff" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/spark_sql_test_3_4.yml" - - ".github/workflows/spark_sql_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-spark-builder/**" - # On-demand PR runs: a committer adds the `run-spark-3.4-tests` label - # and the workflow runs against the PR's merge ref. Works for forks. - pull_request: - types: [labeled] - workflow_dispatch: - inputs: - collect-fallback-logs: - description: 'Whether to collect Comet fallback reasons from spark sql unit test logs' - required: false - default: false - type: boolean - -jobs: - spark-sql: - if: github.event_name != 'pull_request' || github.event.label.name == 'run-spark-3.4-tests' - uses: ./.github/workflows/spark_sql_test_reusable.yml - with: - spark-short: '3.4' - spark-full: '3.4.3' - java: 11 - collect-fallback-logs: ${{ github.event.inputs.collect-fallback-logs == 'true' }} diff --git a/.github/workflows/spark_sql_test_3_5.yml b/.github/workflows/spark_sql_test_3_5.yml deleted file mode 100644 index 0bf656d81d..0000000000 --- a/.github/workflows/spark_sql_test_3_5.yml +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Runs on every PR and on main. Spark 3.5 is the default supported version. -name: Spark SQL Tests (Spark 3.5) - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/spark-3.4/**" - - "!spark/src/main/spark-4.0/**" - - "!spark/src/main/spark-4.1/**" - - "!spark/src/main/spark-4.2/**" - - "!spark/src/main/spark-4.x/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/3.5.8.diff" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/spark_sql_test_3_5.yml" - - ".github/workflows/spark_sql_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-spark-builder/**" - pull_request: - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/spark-3.4/**" - - "!spark/src/main/spark-4.0/**" - - 
"!spark/src/main/spark-4.1/**" - - "!spark/src/main/spark-4.2/**" - - "!spark/src/main/spark-4.x/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/3.5.8.diff" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/spark_sql_test_3_5.yml" - - ".github/workflows/spark_sql_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-spark-builder/**" - workflow_dispatch: - inputs: - collect-fallback-logs: - description: 'Whether to collect Comet fallback reasons from spark sql unit test logs' - required: false - default: false - type: boolean - -jobs: - spark-sql: - uses: ./.github/workflows/spark_sql_test_reusable.yml - with: - spark-short: '3.5' - spark-full: '3.5.8' - java: 11 - collect-fallback-logs: ${{ github.event.inputs.collect-fallback-logs == 'true' }} diff --git a/.github/workflows/spark_sql_test_4_0.yml b/.github/workflows/spark_sql_test_4_0.yml deleted file mode 100644 index d9d090ac47..0000000000 --- a/.github/workflows/spark_sql_test_4_0.yml +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Runs on every PR and on main. Spark 4.0 is the newest stable Spark line. 
-name: Spark SQL Tests (Spark 4.0) - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/spark-3.4/**" - - "!spark/src/main/spark-3.5/**" - - "!spark/src/main/spark-3.x/**" - - "!spark/src/main/spark-4.1/**" - - "!spark/src/main/spark-4.2/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/4.0.2.diff" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/spark_sql_test_4_0.yml" - - ".github/workflows/spark_sql_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-spark-builder/**" - pull_request: - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/spark-3.4/**" - - "!spark/src/main/spark-3.5/**" - - "!spark/src/main/spark-3.x/**" - - "!spark/src/main/spark-4.1/**" - - "!spark/src/main/spark-4.2/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/4.0.2.diff" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/spark_sql_test_4_0.yml" - - ".github/workflows/spark_sql_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-spark-builder/**" - workflow_dispatch: - inputs: - collect-fallback-logs: - description: 'Whether to collect Comet fallback reasons from spark sql unit test logs' - required: false - default: false - type: boolean - -jobs: - spark-sql: - uses: ./.github/workflows/spark_sql_test_reusable.yml - with: - spark-short: '4.0' - spark-full: '4.0.2' - java: 21 - 
collect-fallback-logs: ${{ github.event.inputs.collect-fallback-logs == 'true' }} diff --git a/.github/workflows/spark_sql_test_4_1.yml b/.github/workflows/spark_sql_test_4_1.yml deleted file mode 100644 index 33a6e8438c..0000000000 --- a/.github/workflows/spark_sql_test_4_1.yml +++ /dev/null @@ -1,72 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Runs on main only. Forward-looking coverage for the in-development Spark line. 
-name: Spark SQL Tests (Spark 4.1) - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - push: - branches: - - main - paths: - - "native/**/src/**" - - "native/**/Cargo.toml" - - "native/Cargo.lock" - - "!native/hdfs/**" - - "!native/fs-hdfs/**" - - "common/src/main/**" - - "common/pom.xml" - - "spark/src/main/**" - - "!spark/src/main/spark-3.4/**" - - "!spark/src/main/spark-3.5/**" - - "!spark/src/main/spark-3.x/**" - - "!spark/src/main/spark-4.0/**" - - "!spark/src/main/spark-4.2/**" - - "!spark/src/main/scala/org/apache/comet/GenerateDocs.scala" - - "spark/pom.xml" - - "dev/diffs/4.1.1.diff" - - "pom.xml" - - "rust-toolchain.toml" - - ".github/workflows/spark_sql_test_4_1.yml" - - ".github/workflows/spark_sql_test_reusable.yml" - - ".github/actions/setup-builder/**" - - ".github/actions/setup-spark-builder/**" - # On-demand PR runs: a committer adds the `run-spark-4.1-tests` label - # and the workflow runs against the PR's merge ref. Works for forks. - pull_request: - types: [labeled] - workflow_dispatch: - inputs: - collect-fallback-logs: - description: 'Whether to collect Comet fallback reasons from spark sql unit test logs' - required: false - default: false - type: boolean - -jobs: - spark-sql: - if: github.event_name != 'pull_request' || github.event.label.name == 'run-spark-4.1-tests' - uses: ./.github/workflows/spark_sql_test_reusable.yml - with: - spark-short: '4.1' - spark-full: '4.1.1' - java: 17 - collect-fallback-logs: ${{ github.event.inputs.collect-fallback-logs == 'true' }} diff --git a/.github/workflows/validate_workflows.yml b/.github/workflows/validate_workflows.yml deleted file mode 100644 index 7ddbf366da..0000000000 --- a/.github/workflows/validate_workflows.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Validate Github Workflows - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -on: - pull_request: - paths: - - ".github/workflows/*.yml" - - ".github/workflows/*.yaml" - push: - branches: - - main - -jobs: - validate: - runs-on: ubuntu-slim - steps: - - uses: actions/checkout@v6 - - - name: Install actionlint - run: | - curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash | bash - echo "$PWD" >> $GITHUB_PATH - - - name: Lint GitHub Actions workflows - run: actionlint -color --shellcheck=off diff --git a/dev/ci/compute-changes.py b/dev/ci/compute-changes.py new file mode 100644 index 0000000000..c33016c1c6 --- /dev/null +++ b/dev/ci/compute-changes.py @@ -0,0 +1,284 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Replacement for dorny/paths-filter, which is not on the apache org allow
# list. Reads a list of changed files (one per line) and emits per-job
# "<job>=true|false" lines suitable for $GITHUB_OUTPUT. Pattern semantics
# follow dorny/picomatch: "**" spans path segments, "*" stays within a
# segment, and a leading "!" marks an exclude pattern. Excludes always win:
# a file counts for a job only if it matches at least one include pattern
# and no exclude pattern, regardless of the order the patterns are listed in.

import re
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence

# Benchmark-only sources. They drive the "benchmark" job and are excluded
# from the regular build jobs.
_BENCHMARK_SOURCES = [
    "native/core/benches/**",
    "native/spark-expr/benches/**",
    "spark/src/test/scala/org/apache/spark/sql/benchmark/**",
]

# Excluded from every build/test filter below.
_GENERATE_DOCS = "spark/src/main/scala/org/apache/comet/GenerateDocs.scala"

# Production sources shared by every Spark SQL and Iceberg test filter
# (native/hdfs and native/fs-hdfs are carved out).
_CORE_SOURCES = [
    "native/**/src/**",
    "native/**/Cargo.toml",
    "native/Cargo.lock",
    "!native/hdfs/**",
    "!native/fs-hdfs/**",
    "common/src/main/**",
    "common/pom.xml",
    "spark/src/main/**",
]


def _build_filter(platform_paths: List[str]) -> List[str]:
    """Return the PR-build filter: shared sources, then *platform_paths*, then excludes."""
    return (
        [
            "native/**",
            "common/**",
            "spark/**",
            "spark-integration/**",
            "pom.xml",
            "**/pom.xml",
            ".mvn/**",
            "mvnw",
            "Makefile",
            "rust-toolchain.toml",
            "dev/ci/**",
            ".github/workflows/ci.yml",
        ]
        + platform_paths
        + ["!**.md"]
        + ["!" + pattern for pattern in _BENCHMARK_SOURCES]
        + ["!" + _GENERATE_DOCS]
    )


def _spark_filter(diff_version: str, other_profiles: List[str]) -> List[str]:
    """Return the Spark SQL test filter for one Spark version.

    *diff_version* names the "dev/diffs/<version>.diff" file to watch and
    *other_profiles* lists the "spark-<profile>" source directories that
    belong to other Spark versions and therefore must not trigger this job.
    """
    return (
        _CORE_SOURCES
        + [f"!spark/src/main/spark-{profile}/**" for profile in other_profiles]
        + [
            "!" + _GENERATE_DOCS,
            "spark/pom.xml",
            f"dev/diffs/{diff_version}.diff",
            "pom.xml",
            "rust-toolchain.toml",
            ".github/workflows/ci.yml",
            ".github/workflows/spark_sql_test_reusable.yml",
            ".github/actions/setup-builder/**",
            ".github/actions/setup-spark-builder/**",
        ]
    )


def _iceberg_filter() -> List[str]:
    """Return a fresh copy of the filter shared by all Iceberg test jobs."""
    return _CORE_SOURCES + [
        "!" + _GENERATE_DOCS,
        "spark/pom.xml",
        "dev/diffs/iceberg/**",
        "pom.xml",
        "rust-toolchain.toml",
        ".github/workflows/ci.yml",
        ".github/workflows/iceberg_spark_test_reusable.yml",
        ".github/actions/setup-builder/**",
        ".github/actions/setup-iceberg-builder/**",
    ]


# Job name -> glob patterns. One "<job>=true|false" line is printed per key,
# in this order.
FILTERS: Dict[str, List[str]] = {
    "build_linux": _build_filter(
        [
            ".github/workflows/pr_build_linux.yml",
            ".github/actions/setup-builder/**",
            ".github/actions/java-test/**",
            ".github/actions/rust-test/**",
        ]
    ),
    "build_macos": _build_filter(
        [
            ".github/workflows/pr_build_macos.yml",
            ".github/actions/setup-macos-builder/**",
            ".github/actions/java-test/**",
        ]
    ),
    "benchmark": list(_BENCHMARK_SOURCES),
    "docs": [
        ".asf.yaml",
        ".github/workflows/docs.yaml",
        "docs/**",
    ],
    "spark_3_4": _spark_filter("3.4.3", ["3.5", "4.0", "4.1", "4.2", "4.x"]),
    "spark_3_5": _spark_filter("3.5.8", ["3.4", "4.0", "4.1", "4.2", "4.x"]),
    "spark_4_0": _spark_filter("4.0.2", ["3.4", "3.5", "3.x", "4.1", "4.2"]),
    "spark_4_1": _spark_filter("4.1.1", ["3.4", "3.5", "3.x", "4.0", "4.2"]),
    "iceberg_1_8": _iceberg_filter(),
    "iceberg_1_9": _iceberg_filter(),
    "iceberg_1_10": _iceberg_filter(),
}


def glob_to_regex(pat: str) -> str:
    """Translate a picomatch-style glob into an anchored regex source string.

    Supported syntax:
      "**/"  zero or more whole directories, so "**/pom.xml" also matches a
             "pom.xml" at the repo root
      "**"   any run of characters, path separators included
      "*"    any run of characters inside a single path segment
      "?"    exactly one character other than "/"

    Every other character is matched literally; bracket and brace
    expressions are not interpreted. The result is anchored at both ends,
    using an absolute end-of-string anchor so that a path followed by a
    stray newline does not match.
    """
    out = []
    i = 0
    n = len(pat)
    while i < n:
        if pat.startswith("**/", i):
            out.append("(?:.*/)?")
            i += 3
        elif pat.startswith("**", i):
            out.append(".*")
            i += 2
        elif pat[i] == "*":
            out.append("[^/]*")
            i += 1
        elif pat[i] == "?":
            out.append("[^/]")
            i += 1
        else:
            out.append(re.escape(pat[i]))
            i += 1
    return "^" + "".join(out) + r"\Z"


def matches(patterns: Iterable[str], files: Iterable[str]) -> bool:
    """Return True if any path in *files* is selected by *patterns*.

    A path is selected when it matches at least one include pattern and no
    exclude pattern (a pattern with a leading "!"). An empty *files* list,
    or a pattern list without includes, yields False.
    """
    includes = []
    excludes = []
    for pattern in patterns:
        if pattern.startswith("!"):
            excludes.append(re.compile(glob_to_regex(pattern[1:])))
        else:
            includes.append(re.compile(glob_to_regex(pattern)))
    return any(
        any(rx.match(path) for rx in includes)
        and not any(rx.match(path) for rx in excludes)
        for path in files
    )


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Print one "<job>=true|false" line per entry in FILTERS.

    *argv* holds the command-line arguments without the program name and
    defaults to ``sys.argv[1:]``. The first argument is the path of a UTF-8
    text file listing changed files, one per line; blank lines and
    surrounding whitespace are ignored. Any further arguments are ignored.

    Returns 0 on success, or 2 (after printing a usage message to stderr)
    when no file argument was given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("usage: compute-changes.py <changed-files-list>", file=sys.stderr)
        return 2
    # Decode explicitly: the runner's locale must not change how paths are read.
    text = Path(args[0]).read_text(encoding="utf-8")
    files = [line.strip() for line in text.splitlines() if line.strip()]
    for name, patterns in FILTERS.items():
        flag = "true" if matches(patterns, files) else "false"
        print(f"{name}={flag}")
    return 0


if __name__ == "__main__":
    sys.exit(main())