1313 v2-tests :
1414 runs-on : ubuntu-latest
1515
16+ env :
17+ # Source for the deequ JAR built into the integration-test Spark Connect server.
18+ # See ADR-0004 (deequ repo): the python-deequ wheel and the deequ JAR must be
19+ # paired by release. During PR-review iteration we build from the deequ branch
20+ # tip; flip to a SHA / merged-master / release tag before merging the pair.
21+ DEEQU_REPO : chenliu0831/deequ
22+ DEEQU_REF : protobuf-stage1
23+
1624 steps :
1725 - uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
1826
@@ -27,38 +35,65 @@ jobs:
2735 distribution : " temurin"
2836 java-version : " 17"
2937
38+ # Side-checkout of the deequ source so we can build the JAR with the
39+ # matching wire-format schema in this same job.
40+ - uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
41+ name : Checkout deequ
42+ with :
43+ repository : ${{ env.DEEQU_REPO }}
44+ ref : ${{ env.DEEQU_REF }}
45+ path : deequ-source
46+
47+ # Cache Maven local repo so subsequent CI runs don't re-download the
48+ # hundreds of MB of plugin/dependency jars. The key hashes only the
49+ # deequ pom.xml, so the cache invalidates whenever the build
50+ # dependencies declared there change.
51+ - uses : actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
52+ name : Cache Maven local repo
53+ with :
54+ path : ~/.m2/repository
55+ key : ${{ runner.os }}-maven-${{ hashFiles('deequ-source/pom.xml') }}
56+ restore-keys : |
57+ ${{ runner.os }}-maven-
58+
59+ - name : Build Deequ JAR
60+ # Produces target/deequ_2.12-X.Y.Z-spark-3.5.jar. META-INF/protobuf/ is
61+ # NOT shipped in it (per ADR-0005 — schema lives in source, not in the JAR).
62+ # Exports the JAR's absolute path as DEEQU_JAR via $GITHUB_ENV for later steps.
63+ run : |
64+ cd deequ-source
65+ mvn -B -ntp -DskipTests package
66+ DEEQU_JAR=$(ls target/deequ_2.12-*-spark-3.5.jar | grep -v 'tests\|original' | head -1)
67+ if [ -z "$DEEQU_JAR" ] || [ ! -f "$DEEQU_JAR" ]; then
68+ echo "ERROR: deequ JAR not produced under deequ-source/target/"
69+ ls -la target/ || true
70+ exit 1
71+ fi
72+ echo "DEEQU_JAR=$PWD/$DEEQU_JAR" >> $GITHUB_ENV
73+ echo "Built deequ JAR: $PWD/$DEEQU_JAR"
74+
3075 - name : Download Spark 3.5
3176 run : |
3277 curl -L -o spark-3.5.0-bin-hadoop3.tgz \
3378 https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
3479 tar -xzf spark-3.5.0-bin-hadoop3.tgz
3580 echo "SPARK_HOME=$PWD/spark-3.5.0-bin-hadoop3" >> $GITHUB_ENV
3681
37- - name : Download Deequ JAR
38- # The pinned JAR must match the schema in pydeequ/v2/proto/.
39- # When the schema changes, both the deequ JAR (built from the
40- # corresponding deequ branch) and this URL need to update in the
41- # same PR pair (per ADR-0004 in the deequ repo).
42- run : |
43- curl -L -o deequ_2.12-2.1.0b-spark-3.5.jar \
44- https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/deequ_2.12-2.1.0b-spark-3.5.jar
45-
4682 - name : Install Python dependencies
4783 run : |
4884 pip install --upgrade pip setuptools
49- pip install poetry==1.7.1
85+ pip install " poetry>=2.0,<3.0"
5086 poetry install
5187 poetry add "pyspark[connect]==3.5.0"
5288
5389 - name : Verify checked-in proto stubs are not stale
54- # ADR-0005 drift safeguard: regenerate the stubs from the JAR's
55- # bundled .proto and assert the diff is empty.
56- # NOTE: this requires a JAR that ships META-INF/protobuf/deequ_connect.proto
57- # — only deequ JARs built from the protobuf-stage1 branch onward do.
58- # Until the matching JAR is released, this step is skipped.
59- if : false
90+ # ADR-0005 drift safeguard: regenerate the stubs from the deequ
91+ # source tree we just cloned and assert the diff is empty. Catches
92+ # cases where the .proto changed in deequ but pydeequ/v2/proto/*_pb2.py
93+ # weren't refreshed via scripts/regen_proto.py.
6094 run : |
61- DEEQU_JAR_PATH=$PWD/deequ_2.12-2.1.0b-spark-3.5.jar poetry run python scripts/regen_proto.py
95+ DEEQU_PROTO_DIR=$PWD/deequ-source/src/main/protobuf \
96+ poetry run python scripts/regen_proto.py
6297 git diff --exit-code pydeequ/v2/proto/
6398
6499 - name : Run V2 unit tests
69104 run : |
70105 $SPARK_HOME/sbin/start-connect-server.sh \
71106 --packages org.apache.spark:spark-connect_2.12:3.5.0 \
72- --jars $PWD/deequ_2.12-2.1.0b-spark-3.5.jar \
107+ --jars $DEEQU_JAR \
73108 --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin
74109 # Wait for the gRPC port to accept connections (fail fast if startup broke)
75110 timeout 60 bash -c 'until (echo > /dev/tcp/localhost/15002) >/dev/null 2>&1; do sleep 1; done'
0 commit comments