From b6cc53a200ba567e71f0f747626c7f470a7e1fce Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 19 Jun 2026 21:38:01 -0700 Subject: [PATCH 1/2] chore: add optional CI flow for parquet writes --- .github/workflows/spark_sql_writer_tests.yml | 163 +++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 .github/workflows/spark_sql_writer_tests.yml diff --git a/.github/workflows/spark_sql_writer_tests.yml b/.github/workflows/spark_sql_writer_tests.yml new file mode 100644 index 0000000000..e10a81c2ce --- /dev/null +++ b/.github/workflows/spark_sql_writer_tests.yml @@ -0,0 +1,163 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Manual workflow that exercises Spark's Parquet WRITER test suites with +# Comet's experimental native Parquet writer enabled (ENABLE_COMET_WRITE=true, +# spark.comet.parquet.write.enabled). Self-contained — does not call +# spark_sql_test_reusable.yml — so it can run on feature branches without +# touching the standard CI matrix. +# +# Trigger: workflow_dispatch only. Pick a Spark minor version from the +# selector and click "Run workflow". +# +# Suite list focuses on writer-side coverage: Parquet IO, committer, +# encoding, compression codec, V1/V2 file format, V1/V2 query, V1/V2 +# partition discovery, field IDs, schema; plus the generic writer +# infrastructure (FileFormatWriterSuite, PartitionedWriteSuite). Reader-only +# Parquet suites (ParquetV{1,2}FilterSuite, *SchemaPruningSuite, etc.) are +# intentionally excluded — they're already covered by the sql_core-* lanes. + +name: Spark SQL Writer Tests (manual) + +on: + workflow_dispatch: + inputs: + spark-version: + description: 'Spark minor version to test against' + type: choice + required: true + default: '3.5' + options: + - '3.4' + - '3.5' + - '4.0' + - '4.1' + +env: + RUST_VERSION: stable + RUST_BACKTRACE: 1 + # Force GNU ld on Linux: recent Rust stable defaults to rust-lld on + # x86_64-unknown-linux-gnu, and rust-lld cannot resolve -ljvm against the + # Zulu JDK layout installed by setup-java. Keep bfd for all cargo invocations. + RUSTFLAGS: "-Clink-arg=-fuse-ld=bfd" + +jobs: + writer-test: + name: spark-sql-writer/spark-${{ inputs.spark-version }} + runs-on: ubuntu-24.04 + container: + image: amd64/rust + steps: + - uses: actions/checkout@v7 + + - name: Resolve Spark full version and JDK + id: resolve + shell: bash + run: | + # Map each supported Spark minor version to its full version + JDK. + # Mirrors ci.yml's per-version reusable invocations. + case "${{ inputs.spark-version }}" in + 3.4) spark_full=3.4.3; java=11 ;; + 3.5) spark_full=3.5.8; java=17 ;; + 4.0) spark_full=4.0.2; java=17 ;; + 4.1) spark_full=4.1.2; java=17 ;; + *) echo "Unsupported spark-version: ${{ inputs.spark-version }}" >&2; exit 1 ;; + esac + echo "spark-full=$spark_full" >> "$GITHUB_OUTPUT" + echo "java=$java" >> "$GITHUB_OUTPUT" + + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.RUST_VERSION }} + jdk-version: ${{ steps.resolve.outputs.java }} + + - name: Restore Cargo cache + uses: actions/cache/restore@v5 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + native/target + key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }} + restore-keys: | + ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}- + + - name: Build native library (CI profile) + run: | + cd native + cargo build --profile ci + env: + RUSTFLAGS: "-Ctarget-cpu=x86-64-v3 -Clink-arg=-fuse-ld=bfd" + + - name: Stage native library at release path + run: | + # setup-spark-builder's `mvnw install -DskipTests` (skip-native-build + # path) bundles native/target/release/libcomet.so into the Comet JAR. + # We built with --profile ci to avoid LTO, so the file lives at + # native/target/ci/. Copy it to where the Maven build expects it. + mkdir -p native/target/release + cp native/target/ci/libcomet.so native/target/release/libcomet.so + + - name: Setup Spark + uses: ./.github/actions/setup-spark-builder + with: + spark-version: ${{ steps.resolve.outputs.spark-full }} + spark-short-version: ${{ inputs.spark-version }} + skip-native-build: true + + - name: Run Parquet writer tests + run: | + cd apache-spark + rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups + # SERIAL_SBT_TESTS gates SparkParallelTestGrouping in + # project/SparkBuild.scala. None of the (version, JDK) combinations + # offered here are 4.0 + JDK 21 (the file-stream-leak case the + # reusable workflow handles via DEDICATED_JVM_SBT_TESTS), so we + # always set it to reduce peak memory on standard 7 GB runners. + export SERIAL_SBT_TESTS=1 + # Same forked-test-JVM caps as sql_core-* in spark_sql_test_reusable.yml. + export HEAP_SIZE=3g + export METASPACE_SIZE=1g + NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_WRITE=true \ + build/sbt -Dsbt.log.noformat=true -mem $SBT_MEM \ + 'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \ + "sql/testOnly org.apache.spark.sql.execution.datasources.parquet.ParquetIOSuite org.apache.spark.sql.execution.datasources.parquet.ParquetCommitterSuite org.apache.spark.sql.execution.datasources.parquet.ParquetEncodingSuite org.apache.spark.sql.execution.datasources.parquet.ParquetCompressionCodecPrecedenceSuite org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite org.apache.spark.sql.execution.datasources.parquet.ParquetV1QuerySuite org.apache.spark.sql.execution.datasources.parquet.ParquetV2QuerySuite org.apache.spark.sql.execution.datasources.parquet.ParquetV1PartitionDiscoverySuite org.apache.spark.sql.execution.datasources.parquet.ParquetV2PartitionDiscoverySuite org.apache.spark.sql.execution.datasources.parquet.ParquetFieldIdIOSuite org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaSuite org.apache.spark.sql.execution.datasources.FileFormatWriterSuite org.apache.spark.sql.sources.PartitionedWriteSuite" + env: + LC_ALL: "C.UTF-8" + # Cap SBT orchestrator heap so the freed RAM goes to the forked test + # JVM and OS/container overhead, fixing cgroup-OOM SIGKILLs under + # 7 GB runners. + SBT_MEM: "1024" + # G1GC + tuning for the SBT orchestrator JVM. -Xss4m replaces the + # launcher's -Xss64m default (no compile here, deep recursion not + # needed). UseStringDeduplication and MaxMetaspaceSize cap real and + # ceiling footprint. ExitOnOutOfMemoryError fails fast. + SBT_OPTS: >- + -Xss4m + -XX:+UseG1GC + -XX:+UseStringDeduplication + -XX:MaxMetaspaceSize=384m + -XX:G1HeapRegionSize=2m + -XX:InitiatingHeapOccupancyPercent=35 + -XX:+ParallelRefProcEnabled + -XX:+ExitOnOutOfMemoryError + # On Spark 4.0, process-isolate the V1/V2 Parquet and Orc source + # suites because they leak file streams across suites under newer + # JDKs. project/SparkBuild.scala reads DEDICATED_JVM_SBT_TESTS and + # forks a separate JVM per listed suite. Empty value is a safe no-op. + DEDICATED_JVM_SBT_TESTS: ${{ inputs.spark-version == '4.0' && 'org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite' || '' }} From 6d60e3019dd519857aa7cc9d5e2181bb7d31dd9a Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 19 Jun 2026 21:40:58 -0700 Subject: [PATCH 2/2] chore: add optional CI flow for parquet writes --- .github/workflows/spark_sql_writer_tests.yml | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/.github/workflows/spark_sql_writer_tests.yml b/.github/workflows/spark_sql_writer_tests.yml index e10a81c2ce..515c3b306d 100644 --- a/.github/workflows/spark_sql_writer_tests.yml +++ b/.github/workflows/spark_sql_writer_tests.yml @@ -42,9 +42,7 @@ on: required: true default: '3.5' options: - - '3.4' - '3.5' - - '4.0' - '4.1' env: @@ -69,11 +67,10 @@ jobs: shell: bash run: | # Map each supported Spark minor version to its full version + JDK. - # Mirrors ci.yml's per-version reusable invocations. + # Mirrors ci.yml's per-version reusable invocations (default-on PR + # versions only; 3.4 and 4.0 are label-gated and not offered here). case "${{ inputs.spark-version }}" in - 3.4) spark_full=3.4.3; java=11 ;; 3.5) spark_full=3.5.8; java=17 ;; - 4.0) spark_full=4.0.2; java=17 ;; 4.1) spark_full=4.1.2; java=17 ;; *) echo "Unsupported spark-version: ${{ inputs.spark-version }}" >&2; exit 1 ;; esac @@ -125,10 +122,9 @@ jobs: cd apache-spark rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups # SERIAL_SBT_TESTS gates SparkParallelTestGrouping in - # project/SparkBuild.scala. None of the (version, JDK) combinations - # offered here are 4.0 + JDK 21 (the file-stream-leak case the - # reusable workflow handles via DEDICATED_JVM_SBT_TESTS), so we - # always set it to reduce peak memory on standard 7 GB runners. + # project/SparkBuild.scala. We always set it to reduce peak memory + # on standard 7 GB runners (3.5 and 4.1 are unaffected by the + # 4.0+JDK 21 file-stream-leak case the reusable workflow handles). export SERIAL_SBT_TESTS=1 # Same forked-test-JVM caps as sql_core-* in spark_sql_test_reusable.yml. export HEAP_SIZE=3g @@ -156,8 +152,3 @@ jobs: -XX:InitiatingHeapOccupancyPercent=35 -XX:+ParallelRefProcEnabled -XX:+ExitOnOutOfMemoryError - # On Spark 4.0, process-isolate the V1/V2 Parquet and Orc source - # suites because they leak file streams across suites under newer - # JDKs. project/SparkBuild.scala reads DEDICATED_JVM_SBT_TESTS and - # forks a separate JVM per listed suite. Empty value is a safe no-op. - DEDICATED_JVM_SBT_TESTS: ${{ inputs.spark-version == '4.0' && 'org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite' || '' }}