Skip to content

Commit e3e93af

Browse files
committed
Add Valgrind integration tests
1 parent 90d46c3 commit e3e93af

1 file changed

Lines changed: 82 additions & 0 deletions

File tree

.semaphore/semaphore.yml

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -863,6 +863,88 @@ blocks:
863863
# shell before the agent captures status, so green runs get recorded
864864
# as failed and the epilogue's artifact push never runs.
865865
[ "$rc" -eq 0 ]
866+
# KIP-932: share-consumer integration tests under Valgrind, full-suite per-PR.
867+
# The binding block above keeps Valgrind broker-free on purpose (Memcheck's
868+
# slowdown vs the 1s share lock) — this runs the whole share-consumer suite anyway
869+
# as a per-PR experiment. Expect timing tests (lock expiry/redelivery/throttle)
870+
# to flake and the job to be slow against the 3h cap; curate or promote later.
871+
- name: "Share consumer integration tests under Valgrind"
872+
dependencies: []
873+
task:
874+
agent:
875+
machine:
876+
type: s1-prod-ubuntu24-04-amd64-2
877+
env_vars:
878+
- name: OS_NAME
879+
value: linux
880+
- name: ARCH
881+
value: x64
882+
epilogue:
883+
always:
884+
commands:
885+
# Create artifacts/ first: cp into a missing directory fails, and the
# `|| true` would hide that, leaving the artifact push with nothing to upload.
- mkdir -p artifacts && cp valgrind-share-integration-tests.log artifacts/ || true
886+
- artifact push workflow artifacts/ --destination artifacts/valgrind-integration/ || true
887+
jobs:
888+
- name: "ShareConsumer integration (Valgrind memcheck)"
889+
commands:
890+
- sem-version python 3.11
891+
- sem-version java 17
892+
- pip install uv
893+
- uv venv _venv --python "$(command -v python)" && source _venv/bin/activate
894+
- uv pip install -r requirements/requirements-tests-install.txt
895+
- sudo apt-get update -qq && sudo apt-get install -y -qq valgrind
896+
# Non-ASAN debug build (-g, -O0) so Memcheck stacks are real and the
897+
# optimizer doesn't fake uninitialized-read reports. Same build as the
898+
# binding-layer Valgrind job.
899+
- lib_dir=dest/runtimes/$OS_NAME-$ARCH/native
900+
- LIBRDKAFKA_DEBUG=1 tools/wheels/build-librdkafka-branch.sh "$LIBRDKAFKA_BRANCH" dest
901+
- export CFLAGS="-g -O0 -I${PWD}/dest/build/native/include $CFLAGS"
902+
- export LDFLAGS="-L${PWD}/${lib_dir} $LDFLAGS"
903+
- export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/$lib_dir"
904+
- uv pip install -e .
905+
# Valgrind wraps the pytest process inside trivup's --cmd; the broker is
906+
# a separate JVM trivup starts outside --cmd, so it never runs under
907+
# Memcheck. Suppression and test paths are relative to the checkout —
908+
# trivup runs --cmd from the repo root (same as the ASAN job's relative
909+
# pytest path) — and the inner 2>&1 routes valgrind's stderr report into
910+
# the tee'd log, not just pytest's stdout. --conf matches the ASAN
911+
# integration job (1s share lock, single-broker RF/ISR=1); --timeout is
912+
# large because Memcheck's slowdown stacks on the broker round-trips.
913+
- |
914+
set -o pipefail
915+
export PYTHONPATH="${PWD}"
916+
python -m trivup.clusters.KafkaCluster \
917+
--kraft \
918+
--version 4.2.0 \
919+
--conf '["transaction.state.log.replication.factor=1","transaction.state.log.min.isr=1","offsets.topic.replication.factor=1","offsets.topic.min.isr=1","share.coordinator.state.topic.replication.factor=1","share.coordinator.state.topic.min.isr=1","group.share.record.lock.duration.ms=1000","group.share.min.record.lock.duration.ms=1000"]' \
920+
--cmd 'PYTHONMALLOC=malloc valgrind --error-exitcode=42 --exit-on-first-error=no --leak-check=summary --track-origins=yes --num-callers=40 --suppressions=.semaphore/librdkafka.suppressions --suppressions=.semaphore/valgrind-python.supp python -m pytest -v --timeout=1800 tests/integration/share_consumer/ 2>&1' \
921+
2>&1 | tee valgrind-share-integration-tests.log || true
922+
# The `|| true` keeps set -e from aborting on python's shutdown exit-1
923+
# so the gate runs. Don't trust $?: trivup runs --cmd via /bin/sh (no
924+
# pipefail) and python exits 1 at interpreter shutdown even on a green
925+
# run; valgrind only substitutes 42 when Memcheck finds errors, else forwards that.
926+
rc=0
927+
# Memcheck findings — uninit reads / invalid access, the class ASAN misses.
928+
if grep -Eq 'ERROR SUMMARY: [1-9]' valgrind-share-integration-tests.log; then
929+
echo "Valgrind reported Memcheck errors above — failing the job."
930+
rc=1
931+
fi
932+
# Real pytest failures/errors. Anchored on '^=+ ' so it matches pytest's
933+
# summary banner and not the '==PID==' Memcheck report lines.
934+
if grep -Eq '^=+ .*[0-9]+ (failed|error)' valgrind-share-integration-tests.log; then
935+
echo "pytest reported test failures above — failing the job."
936+
rc=1
937+
fi
938+
# A clean run ends with a green pytest summary; its absence means the
939+
# cluster never came up or pytest crashed/timed out.
940+
if ! grep -Eq '^=+ .*[0-9]+ passed' valgrind-share-integration-tests.log; then
941+
echo "No passing pytest summary found — the run did not complete."
942+
rc=1
943+
fi
944+
# Not `exit $rc`: an explicit exit (even 0) kills Semaphore's command
945+
# shell before the agent captures status, so green runs get recorded
946+
# as failed and the epilogue's artifact push never runs.
947+
[ "$rc" -eq 0 ]
866948
promotions:
867949
- name: "Publish to Test PyPI"
868950
pipeline_file: publish-test-pypi.yml

0 commit comments

Comments
 (0)