Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 100 additions & 26 deletions .github/workflows/base.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: PyDeequ V2 Tests
Comment thread
chenliu0831 marked this conversation as resolved.

on:
push:
branches:
Expand All @@ -7,43 +9,115 @@ on:
- "master"

jobs:
test:
# V2 tests with Spark Connect (Python 3.12)
v2-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- PYSPARK_VERSION: "3.1.3"
PYTHON_VERSION: "3.9"
JAVA_VERSION: "11"
- PYSPARK_VERSION: "3.2"
PYTHON_VERSION: "3.9"
JAVA_VERSION: "11"
- PYSPARK_VERSION: "3.3"
PYTHON_VERSION: "3.9"
JAVA_VERSION: "11"
- PYSPARK_VERSION: "3.5"
PYTHON_VERSION: "3.9"
JAVA_VERSION: "17"

env:
# Source for the deequ JAR built into the integration-test Spark Connect server.
# See ADR-0004 (deequ repo): the python-deequ wheel and the deequ JAR must be
# paired by release. During PR-review iteration we build from the deequ branch
# tip; flip to a SHA / merged-master / release tag before merging the pair.
DEEQU_REPO: chenliu0831/deequ
DEEQU_REF: protobuf-stage1

steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
name: Install Python 3.12
with:
python-version: ${{matrix.PYTHON_VERSION}}
python-version: "3.12"

- uses: actions/setup-java@c5195efecf7bdfc987ee8bae7a71cb8b11521c00 # v4.7.1
name: Setup Java 17
with:
java-version: ${{matrix.JAVA_VERSION}}
distribution: "temurin"
java-version: "17"

- name: Running tests with pyspark==${{matrix.PYSPARK_VERSION}}
env:
SPARK_VERSION: ${{matrix.PYSPARK_VERSION}}
# Check out the deequ sources alongside this repository so the JAR can be
# built in this same job with the matching wire-format schema.
- name: Checkout deequ
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
  with:
    path: deequ-source
    repository: ${{ env.DEEQU_REPO }}
    ref: ${{ env.DEEQU_REF }}

# Persist the Maven local repository between CI runs to avoid re-downloading
# hundreds of MB of plugin and dependency jars. The key combines the runner
# OS with a hash of the deequ pom.xml, so the cache is invalidated whenever
# the pom changes; restore-keys fall back to a previous cache for the same OS.
- name: Cache Maven local repo
  uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57  # v4.2.0
  with:
    path: ~/.m2/repository
    key: ${{ runner.os }}-maven-${{ hashFiles('deequ-source/pom.xml') }}
    restore-keys: |
      ${{ runner.os }}-maven-

- name: Build Deequ JAR
  # Builds target/deequ_2.12-X.Y.Z-spark-3.5.jar. Per ADR-0005 the JAR does
  # NOT ship META-INF/protobuf/ (the schema lives in source, not in the JAR).
  # The absolute JAR path is exported through $GITHUB_ENV so that subsequent
  # steps can reference it as $DEEQU_JAR.
  run: |
    cd deequ-source
    mvn -B -ntp -DskipTests package
    # Pick the first matching artifact whose name contains neither "tests"
    # nor "original". `|| true` keeps a non-matching glob from aborting the
    # script (under -e / pipefail) before the explicit error report below.
    DEEQU_JAR=$(ls target/deequ_2.12-*-spark-3.5.jar 2>/dev/null | grep -Ev 'tests|original' | head -1 || true)
    if [ -z "$DEEQU_JAR" ] || [ ! -f "$DEEQU_JAR" ]; then
      echo "ERROR: deequ JAR not produced under deequ-source/target/"
      ls -la target/ || true
      exit 1
    fi
    # Quote the redirect target so an unusual runner path cannot be word-split.
    echo "DEEQU_JAR=$PWD/$DEEQU_JAR" >> "$GITHUB_ENV"
    echo "Built deequ JAR: $PWD/$DEEQU_JAR"

- name: Download Spark 3.5
  # Fetches the Spark 3.5.0 binary distribution from the Apache archive,
  # unpacks it into the workspace and exports SPARK_HOME for later steps.
  run: |
    # --fail makes an HTTP error (e.g. 404/503) exit non-zero instead of
    # saving the error page as the tarball; --retry absorbs transient
    # network failures.
    curl --fail --location --retry 3 -o spark-3.5.0-bin-hadoop3.tgz \
      https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
    tar -xzf spark-3.5.0-bin-hadoop3.tgz
    echo "SPARK_HOME=$PWD/spark-3.5.0-bin-hadoop3" >> "$GITHUB_ENV"

- name: Install Python dependencies
run: |
pip install --upgrade pip
pip install poetry==1.7.1
pip install --upgrade pip setuptools
pip install "poetry>=2.0,<3.0"
poetry install
poetry run pip install pyspark==$SPARK_VERSION
poetry run python -m pytest -s tests --ignore=tests/test_bot.py
poetry add "pyspark[connect]==3.5.0"

- name: Verify checked-in proto stubs are not stale
  # Drift safeguard from ADR-0005: regenerate the stubs from the deequ source
  # tree checked out earlier in this job and require an empty diff. This
  # catches a .proto change in deequ for which pydeequ/v2/proto/*_pb2.py was
  # not refreshed via scripts/regen_proto.py.
  run: |
    DEEQU_PROTO_DIR="$PWD/deequ-source/src/main/protobuf" poetry run python scripts/regen_proto.py
    git diff --exit-code pydeequ/v2/proto/

# Runs only the V2 unit-test module, verbosely.
- name: Run V2 unit tests
  run: poetry run pytest -v tests/v2/test_unit.py

- name: Start Spark Connect Server
  # Launches the Spark Connect daemon with the freshly built deequ JAR and
  # the deequ relation plugin registered, then blocks until it is reachable.
  run: |
    "$SPARK_HOME"/sbin/start-connect-server.sh \
      --packages org.apache.spark:spark-connect_2.12:3.5.0 \
      --jars "$DEEQU_JAR" \
      --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin
    # Wait up to 60s for the gRPC port to accept connections. On timeout,
    # print the tail of whatever is under $SPARK_HOME/logs (best effort) so
    # the failure can be diagnosed from the CI output.
    if ! timeout 60 bash -c 'until (echo > /dev/tcp/localhost/15002) >/dev/null 2>&1; do sleep 1; done'; then
      echo "ERROR: Spark Connect server did not open port 15002 within 60s"
      tail -n 200 "$SPARK_HOME"/logs/* || true
      exit 1
    fi
    # Verify the server process is running (exits non-zero if none is found).
    pgrep -af SparkConnectServer

# Runs everything under tests/v2/ except the unit-test module, with
# SPARK_REMOTE set to the local Spark Connect endpoint.
- name: Run V2 integration tests
  run: poetry run pytest tests/v2/ -v --ignore=tests/v2/test_unit.py
  env:
    SPARK_REMOTE: 'sc://localhost:15002'

# Best-effort teardown: runs even when earlier steps failed, and a failing
# stop script never fails the job.
- name: Stop Spark Connect Server
  if: ${{ always() }}
  run: $SPARK_HOME/sbin/stop-connect-server.sh || true
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -148,5 +148,12 @@ dmypy.json
# Cython debug symbols
cython_debug/

# DS_STORE
# Note: the generated stubs pydeequ/v2/proto/deequ_connect_pb2.py and
# deequ_connect_pb2.pyi are CHECKED IN per ADR-0005 (graphframes pattern).
# Run scripts/regen_proto.py to refresh them when the deequ schema changes.
# The intermediate .proto extracted by regen_proto.py is NOT checked in —
# the canonical schema lives in the deequ repo.
pydeequ/v2/proto/deequ_connect.proto

# DS_STORE
.DS_Store
Loading
Loading