@@ -863,6 +863,88 @@ blocks:
863863 # shell before the agent captures status, so green runs get recorded
864864 # as failed and the epilogue's artifact push never runs.
865865 [ "$rc" -eq 0 ]
# KIP-932: share-consumer integration tests under Valgrind, full-suite per-PR.
# The binding block above keeps Valgrind broker-free on purpose (Memcheck's
# slowdown vs the 1s share lock) — this block runs the whole integration suite
# anyway, as a per-PR experiment. Expect timing tests (lock expiry, redelivery,
# throttle) to flake and the job to be slow against the 3h cap; curate or
# promote later.
- name: " Share consumer integration tests under Valgrind"
  dependencies: []
  task:
    agent:
      machine:
        type: s1-prod-ubuntu24-04-amd64-2
    env_vars:
      - name: OS_NAME
        value: linux
      - name: ARCH
        value: x64
    epilogue:
      always:
        commands:
          # Create the staging directory first: no command in this block
          # creates artifacts/, and cp into a missing directory fails (hidden
          # by `|| true`), which would leave nothing for the push to upload.
          - mkdir -p artifacts || true
          - cp valgrind-share-integration-tests.log artifacts/ || true
          - artifact push workflow artifacts/ --destination artifacts/valgrind-integration/ || true
    jobs:
      - name: " ShareConsumer integration (Valgrind memcheck)"
        commands:
          - sem-version python 3.11
          - sem-version java 17
          - pip install uv
          - uv venv _venv --python "$(command -v python)" && source _venv/bin/activate
          - uv pip install -r requirements/requirements-tests-install.txt
          - sudo apt-get update -qq && sudo apt-get install -y -qq valgrind
          # Non-ASAN debug build (-g, -O0) so Memcheck stacks are real and the
          # optimizer doesn't fake uninitialized-read reports. Same build as the
          # binding-layer Valgrind job.
          - lib_dir=dest/runtimes/$OS_NAME-$ARCH/native
          - LIBRDKAFKA_DEBUG=1 tools/wheels/build-librdkafka-branch.sh "$LIBRDKAFKA_BRANCH" dest
          - export CFLAGS="-g -O0 -I${PWD}/dest/build/native/include $CFLAGS"
          - export LDFLAGS="-L${PWD}/${lib_dir} $LDFLAGS"
          # Only prepend the existing value (and its ':') when it is non-empty:
          # a leading empty entry makes the dynamic loader search the current
          # working directory.
          - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$PWD/$lib_dir"
          - uv pip install -e .
          # Valgrind wraps the pytest process inside trivup's --cmd; the broker is
          # a separate JVM trivup starts outside --cmd, so it never runs under
          # Memcheck. Suppression and test paths are relative to the checkout —
          # trivup runs --cmd from the repo root (same as the ASAN job's relative
          # pytest path) — and the inner 2>&1 routes valgrind's stderr report into
          # the tee'd log, not just pytest's stdout. --conf matches the ASAN
          # integration job (1s share lock, single-broker RF/ISR=1); --timeout is
          # large because Memcheck's slowdown stacks on the broker round-trips.
          - |
            set -o pipefail
            export PYTHONPATH="${PWD}"
            python -m trivup.clusters.KafkaCluster \
              --kraft \
              --version 4.2.0 \
              --conf '["transaction.state.log.replication.factor=1","transaction.state.log.min.isr=1","offsets.topic.replication.factor=1","offsets.topic.min.isr=1","share.coordinator.state.topic.replication.factor=1","share.coordinator.state.topic.min.isr=1","group.share.record.lock.duration.ms=1000","group.share.min.record.lock.duration.ms=1000"]' \
              --cmd 'PYTHONMALLOC=malloc valgrind --error-exitcode=42 --exit-on-first-error=no --leak-check=summary --track-origins=yes --num-callers=40 --suppressions=.semaphore/librdkafka.suppressions --suppressions=.semaphore/valgrind-python.supp python -m pytest -v --timeout=1800 tests/integration/share_consumer/ 2>&1' \
              2>&1 | tee valgrind-share-integration-tests.log || true
            # The `|| true` keeps set -e from aborting on python's shutdown exit-1
            # so the gate below runs. Don't trust $?: trivup runs --cmd via /bin/sh
            # (no pipefail) and python exits 1 at interpreter shutdown even on a
            # green run; with --error-exitcode=42 valgrind just forwards that.
            # The verdict is therefore derived from the log contents instead.
            rc=0
            # Memcheck findings — uninit reads / invalid access, the class ASAN misses.
            if grep -Eq 'ERROR SUMMARY: [1-9]' valgrind-share-integration-tests.log; then
              echo "Valgrind reported Memcheck errors above — failing the job."
              rc=1
            fi
            # Real pytest failures/errors. Anchored on '^=+ ' (a run of '=' followed
            # by a space) so it matches pytest's summary banner and not the
            # '==PID==' Memcheck report lines, where digits follow the '=' run.
            if grep -Eq '^=+ .*[0-9]+ (failed|error)' valgrind-share-integration-tests.log; then
              echo "pytest reported test failures above — failing the job."
              rc=1
            fi
            # A clean run ends with a green pytest summary; its absence means the
            # cluster never came up or pytest crashed/timed out.
            if ! grep -Eq '^=+ .*[0-9]+ passed' valgrind-share-integration-tests.log; then
              echo "No passing pytest summary found — the run did not complete."
              rc=1
            fi
            # Not `exit $rc`: an explicit exit (even 0) kills Semaphore's command
            # shell before the agent captures status, so green runs get recorded
            # as failed and the epilogue's artifact push never runs.
            [ "$rc" -eq 0 ]
866948promotions :
867949 - name : " Publish to Test PyPI"
868950 pipeline_file : publish-test-pypi.yml
0 commit comments