Skip to content

Commit 4c3145f

Browse files
committed
Fix CI by pinning to the deequ working branch for now
1 parent 5f8a5a1 commit 4c3145f

2 files changed

Lines changed: 134 additions & 24 deletions

File tree

.github/workflows/base.yml

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@ jobs:
1313
v2-tests:
1414
runs-on: ubuntu-latest
1515

16+
env:
17+
# Source for the deequ JAR built into the integration-test Spark Connect server.
18+
# See ADR-0004 (deequ repo): the python-deequ wheel and the deequ JAR must be
19+
# paired by release. During PR-review iteration we build from the deequ branch
20+
# tip; flip to a SHA / merged-master / release tag before merging the pair.
21+
DEEQU_REPO: chenliu0831/deequ
22+
DEEQU_REF: protobuf-stage1
23+
1624
steps:
1725
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
1826

@@ -27,38 +35,65 @@ jobs:
2735
distribution: "temurin"
2836
java-version: "17"
2937

38+
# Side-checkout of the deequ source so we can build the JAR with the
39+
# matching wire-format schema in this same job.
40+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
41+
name: Checkout deequ
42+
with:
43+
repository: ${{ env.DEEQU_REPO }}
44+
ref: ${{ env.DEEQU_REF }}
45+
path: deequ-source
46+
47+
# Cache the Maven local repo so subsequent CI runs don't re-download
48+
# hundreds of MB of plugin/dependency jars. The key hashes the deequ
49+
# pom.xml (expected to declare the protobuf-maven-plugin version), so the
50+
# cache invalidates whenever build dependencies change.
51+
- uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
52+
name: Cache Maven local repo
53+
with:
54+
path: ~/.m2/repository
55+
key: ${{ runner.os }}-maven-${{ hashFiles('deequ-source/pom.xml') }}
56+
restore-keys: |
57+
${{ runner.os }}-maven-
58+
59+
- name: Build Deequ JAR
60+
# Produces target/deequ_2.12-X.Y.Z-spark-3.5.jar. The JAR does NOT ship
61+
# META-INF/protobuf/ (per ADR-0005 — schema lives in source, not in the JAR).
62+
# Exports the JAR path via $GITHUB_ENV so subsequent steps can reference it.
63+
run: |
64+
cd deequ-source
65+
mvn -B -ntp -DskipTests package
66+
DEEQU_JAR=$(ls target/deequ_2.12-*-spark-3.5.jar | grep -v 'tests\|original' | head -1)
67+
if [ -z "$DEEQU_JAR" ] || [ ! -f "$DEEQU_JAR" ]; then
68+
echo "ERROR: deequ JAR not produced under deequ-source/target/"
69+
ls -la target/ || true
70+
exit 1
71+
fi
72+
echo "DEEQU_JAR=$PWD/$DEEQU_JAR" >> $GITHUB_ENV
73+
echo "Built deequ JAR: $PWD/$DEEQU_JAR"
74+
3075
- name: Download Spark 3.5
3176
run: |
3277
curl -L -o spark-3.5.0-bin-hadoop3.tgz \
3378
https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
3479
tar -xzf spark-3.5.0-bin-hadoop3.tgz
3580
echo "SPARK_HOME=$PWD/spark-3.5.0-bin-hadoop3" >> $GITHUB_ENV
3681
37-
- name: Download Deequ JAR
38-
# The pinned JAR must match the schema in pydeequ/v2/proto/.
39-
# When the schema changes, both the deequ JAR (built from the
40-
# corresponding deequ branch) and this URL need to update in the
41-
# same PR pair (per ADR-0004 in the deequ repo).
42-
run: |
43-
curl -L -o deequ_2.12-2.1.0b-spark-3.5.jar \
44-
https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/deequ_2.12-2.1.0b-spark-3.5.jar
45-
4682
- name: Install Python dependencies
4783
run: |
4884
pip install --upgrade pip setuptools
49-
pip install poetry==1.7.1
85+
pip install "poetry>=2.0,<3.0"
5086
poetry install
5187
poetry add "pyspark[connect]==3.5.0"
5288
5389
- name: Verify checked-in proto stubs are not stale
54-
# ADR-0005 drift safeguard: regenerate the stubs from the JAR's
55-
# bundled .proto and assert the diff is empty.
56-
# NOTE: this requires a JAR that ships META-INF/protobuf/deequ_connect.proto
57-
# — only deequ JARs built from the protobuf-stage1 branch onward do.
58-
# Until the matching JAR is released, this step is skipped.
59-
if: false
90+
# ADR-0005 drift safeguard: regenerate the stubs from the deequ
91+
# source tree we just cloned and assert the diff is empty. Catches
92+
# cases where the .proto changed in deequ but the pydeequ/v2/proto/*_pb2.py
93+
# stubs weren't refreshed via scripts/regen_proto.py.
6094
run: |
61-
DEEQU_JAR_PATH=$PWD/deequ_2.12-2.1.0b-spark-3.5.jar poetry run python scripts/regen_proto.py
95+
DEEQU_PROTO_DIR=$PWD/deequ-source/src/main/protobuf \
96+
poetry run python scripts/regen_proto.py
6297
git diff --exit-code pydeequ/v2/proto/
6398
6499
- name: Run V2 unit tests
@@ -69,7 +104,7 @@ jobs:
69104
run: |
70105
$SPARK_HOME/sbin/start-connect-server.sh \
71106
--packages org.apache.spark:spark-connect_2.12:3.5.0 \
72-
--jars $PWD/deequ_2.12-2.1.0b-spark-3.5.jar \
107+
--jars $DEEQU_JAR \
73108
--conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin
74109
# Wait for the gRPC port to accept connections (fail fast if startup broke)
75110
timeout 60 bash -c 'until (echo > /dev/tcp/localhost/15002) >/dev/null 2>&1; do sleep 1; done'

poetry.lock

Lines changed: 81 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)