|
| 1 | +#!/bin/bash |
| 2 | +# |
| 3 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 4 | +# or more contributor license agreements. See the NOTICE file |
| 5 | +# distributed with this work for additional information |
| 6 | +# regarding copyright ownership. The ASF licenses this file |
| 7 | +# to you under the Apache License, Version 2.0 (the |
| 8 | +# "License"); you may not use this file except in compliance |
| 9 | +# with the License. You may obtain a copy of the License at |
| 10 | +# |
| 11 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 12 | +# |
| 13 | +# Unless required by applicable law or agreed to in writing, |
| 14 | +# software distributed under the License is distributed on an |
| 15 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 16 | +# KIND, either express or implied. See the License for the |
| 17 | +# specific language governing permissions and limitations |
| 18 | +# under the License. |
| 19 | +# |
| 20 | + |
| 21 | +# Runs Apache Spark's SQL test suites locally with Comet enabled, reproducing |
| 22 | +# the spark_sql_test.yml GitHub Actions workflow for Spark 4.1. |
| 23 | +# |
| 24 | +# -e is intentionally not set: when running all module shards, one failing |
| 25 | +# shard must not stop the rest. Build and setup failures are checked |
| 26 | +# explicitly below. |
| 27 | + |
# Abort on unset variables and surface failures from any pipeline stage.
# -e is deliberately left off (see the header comment), so every step whose
# failure matters is checked by hand.
set -uo pipefail

# Absolute directory containing this script. config.sh, setup-spark.sh and the
# logs directory are all resolved relative to it.
if ! SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; then
  echo "ERROR: cannot determine the script directory." >&2
  exit 1
fi

# Without -e a failed `source` would be ignored and the script would carry on
# with none of the shared settings defined; the later module lookup would then
# misreport the problem as an unknown module. Fail early with a clear message.
if [ ! -r "$SCRIPT_DIR/config.sh" ]; then
  echo "ERROR: cannot read $SCRIPT_DIR/config.sh" >&2
  exit 1
fi
# shellcheck source=config.sh
source "$SCRIPT_DIR/config.sh"
| 33 | + |
# Print the command-line help text to stdout.
# Globals (read): SPARK_VERSION, SPARK_SQL_MODULES -- presumably provided by
#                 config.sh; confirm there.
usage() {
  local prog
  prog="$(basename "$0")"
  cat <<EOF
Usage: $prog [module]

Run Apache Spark SQL test suites locally with Comet enabled (Spark $SPARK_VERSION).

Arguments:
  module    One of: ${SPARK_SQL_MODULES[*]}
            or 'all' to run every shard sequentially (default).

Environment variables:
  SKIP_BUILD=1         Skip the Comet build; reuse existing artifacts.
  SKIP_SPARK_SETUP=1   Skip the Spark clone/reset/diff step.
  COMET_SPARK_DIR      Spark checkout path (default: \$HOME/.cache/datafusion-comet/apache-spark).
  SPARK_REF            Git ref for the Spark sources (default: v$SPARK_VERSION).
  SBT_MEM              sbt heap size in MB (default: 4096).
  LC_ALL               Locale for the sbt run (default: C.UTF-8; use en_US.UTF-8 on macOS).
EOF
}
| 53 | + |
# Resolve the list of modules to run from the first positional argument.
#   -h / --help  print the help text and exit successfully
#   all          (default) every entry of SPARK_SQL_MODULES
#   <name>       a single module, accepted only if module_sbt_args succeeds
#                for it; anything else is rejected with the usage text.
module="${1:-all}"
modules_to_run=()
case "$module" in
  -h | --help)
    usage
    exit 0
    ;;
  all)
    modules_to_run=("${SPARK_SQL_MODULES[@]}")
    ;;
  *)
    # module_sbt_args is used purely as a validity probe here; its output
    # and diagnostics are discarded.
    if ! module_sbt_args "$module" >/dev/null 2>&1; then
      echo "ERROR: unknown module '$module'" >&2
      echo >&2
      usage >&2
      exit 1
    fi
    modules_to_run=("$module")
    ;;
esac
| 71 | + |
# --- JDK version check (warning only) --------------------------------------

# Print the major version of the `java` found on PATH (e.g. 17, or 8 for the
# legacy "1.8.0_x" scheme).
# Outputs: the major version on stdout.
# Returns: 0 on success, 1 if no version string could be found in the
#          `java -version` output (java missing, or unexpected output).
_detect_jdk_major() {
  local banner
  local legacy_re='version "1\.([0-9]+)'
  local modern_re='version "([0-9]+)'

  # `java -version` writes to stderr. The whole output is searched rather
  # than only its first line, because the JVM may print notices such as
  # "Picked up JAVA_TOOL_OPTIONS: ..." before the version line.
  banner="$(java -version 2>&1)"

  if [[ "$banner" =~ $legacy_re ]]; then
    # JDK 8 and earlier report "1.<major>.x".
    printf '%s\n' "${BASH_REMATCH[1]}"
  elif [[ "$banner" =~ $modern_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    return 1
  fi
}

# A mismatch (or an undetectable JDK) only produces a warning; the run
# continues either way.
if ! jdk_version="$(_detect_jdk_major)"; then
  echo "WARNING: could not determine the active JDK version (is 'java' on PATH?); Spark $SPARK_VERSION CI uses JDK $REQUIRED_JDK." >&2
elif [ "$jdk_version" != "$REQUIRED_JDK" ]; then
  echo "WARNING: active JDK reports major version '$jdk_version'; Spark $SPARK_VERSION CI uses JDK $REQUIRED_JDK." >&2
  echo "         Set JAVA_HOME to a JDK $REQUIRED_JDK install to match CI exactly." >&2
fi
| 78 | + |
# --- Build Comet -----------------------------------------------------------
# Runs `make release` from the Comet repo root with the Spark profile passed
# through PROFILES. Setting SKIP_BUILD=1 bypasses the build entirely.
if [[ "${SKIP_BUILD:-}" != "1" ]]; then
  echo "Building Comet (PROFILES=-Pspark-$SPARK_SHORT make release) ..."
  # The subshell keeps the cd from leaking into the rest of the script.
  (
    cd "$COMET_REPO_ROOT" && PROFILES="-Pspark-$SPARK_SHORT" make release
  ) || {
    echo "ERROR: Comet build failed." >&2
    exit 1
  }
else
  echo "SKIP_BUILD=1: skipping Comet build."
fi
| 89 | + |
# --- Purge partial Maven cache entries -------------------------------------
# Mirrors .github/actions/setup-spark-builder/action.yaml. Comet's Maven phase
# downloads POMs for transitive artifacts whose JARs it never needs. sbt's
# Coursier resolver then treats the POM-only entry as "found locally" and
# fails on the missing JAR instead of fetching it remotely. Delete those
# partial entries so sbt re-fetches the full artifact.

# Remove POM-only entries for jar/bundle artifacts below a repository root.
# Arguments: $1 - Maven repository root to scan.
# For every *.pom that has no sibling .jar and declares jar or bundle
# packaging, this deletes the POM, its .sha1 and .lastUpdated companions, and
# the directory's _remote.repositories marker. Other entries are left alone.
purge_partial_maven_entries() {
  local repo="$1"
  local pom

  # NUL-delimited so that no character in a path can split an entry.
  while IFS= read -r -d '' pom; do
    [ -f "${pom%.pom}.jar" ] && continue
    # -E alternation instead of the GNU-only '\|' so BSD/macOS grep agrees.
    grep -qE '<packaging>(jar|bundle)</packaging>' -- "$pom" 2>/dev/null || continue
    rm -f -- "$pom" "${pom}.sha1" "${pom}.lastUpdated" \
      "${pom%/*}/_remote.repositories"
  done < <(find "$repo" -name '*.pom' -print0)
}

maven_repo="$HOME/.m2/repository"
if [ -d "$maven_repo" ]; then
  echo "Purging partial Maven cache entries ..."
  purge_partial_maven_entries "$maven_repo"
fi
| 107 | + |
# --- Set up the Spark checkout ---------------------------------------------
# Default path: delegate to setup-spark.sh next to this script. With
# SKIP_SPARK_SETUP=1 the existing checkout is used untouched, but it must at
# least look like a git working tree.
if [[ "${SKIP_SPARK_SETUP:-}" != "1" ]]; then
  "$SCRIPT_DIR/setup-spark.sh" || {
    echo "ERROR: Spark setup failed." >&2
    exit 1
  }
else
  echo "SKIP_SPARK_SETUP=1: using the existing Spark checkout as-is."
  [[ -d "$COMET_SPARK_DIR/.git" ]] || {
    echo "ERROR: SKIP_SPARK_SETUP=1 but no Spark checkout at $COMET_SPARK_DIR" >&2
    exit 1
  }
fi
| 121 | + |
# --- Run the selected module shards ----------------------------------------
# Each shard runs in turn. Its combined stdout/stderr is shown live and also
# written to logs/<module>.log next to this script. A failing shard is
# recorded and the loop moves on to the next one.
log_dir="$SCRIPT_DIR/logs"
mkdir -p "$log_dir"

results=()
overall_status=0
banner="=================================================================="

for shard in "${modules_to_run[@]}"; do
  shard_args="$(module_sbt_args "$shard")"
  shard_log="$log_dir/${shard}.log"
  printf '\n%s\nModule: %s\nsbt args: %s\nLog file: %s\n%s\n' \
    "$banner" "$shard" "$shard_args" "$shard_log" "$banner"

  # Stale Parquet cache workaround (mirrors spark_sql_test.yml).
  rm -rf "$maven_repo/org/apache/parquet"

  # The subshell confines both the cd and the exported settings to this one
  # sbt invocation.
  (
    cd "$COMET_SPARK_DIR" || exit 1
    export NOLINT_ON_COMPILE=true
    export ENABLE_COMET=true
    export ENABLE_COMET_ONHEAP=true
    export ENABLE_COMET_LOG_FALLBACK_REASONS=false
    export SERIAL_SBT_TESTS=1
    # The shard's sbt arguments are passed as one single argument.
    build/sbt -Dsbt.log.noformat=true -mem "$SBT_MEM" \
      'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \
      "$shard_args"
  ) 2>&1 | tee "$shard_log"
  # Exit status of the sbt subshell, not of tee; must be read right after
  # the pipeline, before any other command resets PIPESTATUS.
  rc="${PIPESTATUS[0]}"

  if ((rc != 0)); then
    results+=("FAIL $shard (sbt exit $rc)")
    overall_status=1
  else
    results+=("PASS $shard")
  fi
done
| 162 | + |
# --- Summary ---------------------------------------------------------------
# One line per shard, then the log location. The script's exit status is 1 if
# any shard failed and 0 otherwise.
rule="=================================================================="
printf '\n%s\n%s\n%s\n' \
  "$rule" "Spark SQL test summary (Spark $SPARK_VERSION)" "$rule"
for entry in "${results[@]}"; do
  printf '  %s\n' "$entry"
done
printf '%s\n' "Logs written to: $log_dir"
exit "$overall_status"
0 commit comments