-
Notifications
You must be signed in to change notification settings - Fork 1.5k
139 lines (125 loc) · 4.55 KB
/
Copy pathflakey-test-repro.yml
File metadata and controls
139 lines (125 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Manual stress-repro workflow for flakey integration tests.
#
# Run a single pytest node id many times, in parallel, under CPU pressure,
# to maximize the chance of catching timing-sensitive flakes. Uploads the
# pytest log from any shard that catches a failure.
#
# Trigger from the Actions UI via "Run workflow" (workflow_dispatch only) -
# does NOT run on PRs or pushes, to avoid burning runner minutes on every
# commit. Defaults target the test_heartbeat_thread rejoin hang, but any
# pytest node id can be supplied.
#
name: Flakey test repro

# Manual trigger only. No input declares a `type`, so each one arrives as a
# string and is read by the job as github.event.inputs.<name>.
on:
  workflow_dispatch:
    inputs:
      # What to run.
      test_node:
        description: 'pytest node id to stress'
        default: 'test/integration/test_consumer_group.py::test_heartbeat_thread'
        required: true
      # How hard to run it (per shard).
      count:
        description: 'pytest-repeat --count per shard'
        default: '100'
        required: true
      parallelism:
        description: 'pytest-xdist workers per shard'
        default: '4'
        required: true
      cpu_load:
        description: 'stress-ng CPU workers running alongside (0 disables)'
        default: '2'
        required: true
      # What to run it against. Versions stay quoted so they are never
      # re-typed as floats (e.g. 3.10 -> 3.1).
      kafka_version:
        description: 'Kafka broker version'
        default: '4.3.0'
        required: true
      python_version:
        description: 'Python version'
        default: '3.14'
        required: true
# Workflow-level environment: exported to every step of every job.
env:
  # Conventional switch honoured by many CLI tools to keep coloured output
  # even though the Actions log is not a TTY.
  FORCE_COLOR: '1'
  # pip maps PIP_<OPTION> variables onto its command-line flags; these two
  # correspond to --disable-pip-version-check and --no-python-version-warning.
  PIP_DISABLE_PIP_VERSION_CHECK: '1'
  PIP_NO_PYTHON_VERSION_WARNING: '1'
# Single job, fanned out over a shard matrix. Every shard performs the same
# sequence: provision Python/Java/Kafka, optionally start background CPU load,
# run the requested pytest node repeatedly, then stop the load and upload the
# pytest log if the shard failed.
jobs:
  stress:
    name: "Stress shard ${{ matrix.shard }}"
    runs-on: ubuntu-latest
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        # Four independent shards on four runners. Each shard runs the same
        # stress config; multiplying shards multiplies aggregate iterations
        # without serializing. A shard that catches the bug exits fast (-x);
        # other shards continue so we get as many traces as possible.
        shard: [1, 2, 3, 4]
    steps:
      - uses: actions/checkout@v7

      - name: Set up Python ${{ github.event.inputs.python_version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ github.event.inputs.python_version }}
          cache: pip
          cache-dependency-path: |
            requirements-dev.txt

      - name: Install dependencies
        run: |
          # Refresh the package index first: the index baked into the runner
          # image can be stale, which makes the install fail with 404s.
          sudo apt-get update
          sudo apt-get install -y libsnappy-dev libzstd-dev stress-ng
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt
          pip install pytest-repeat pytest-xdist

      - name: Setup java
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          # Quoted so the version is always passed as a string.
          java-version: '23'

      - name: Restore cached kafka releases
        id: cache-servers-dist-restore
        uses: actions/cache/restore@v6
        with:
          path: servers/dist
          key: servers-dist-${{ github.event.inputs.kafka_version }}

      - name: Install Kafka release
        # The input is handed over through the environment instead of being
        # expanded into the script text, so its content can never be parsed
        # as shell syntax. A REPRO_-prefixed name is used so the variable
        # cannot shadow anything the Makefile itself reads.
        env:
          REPRO_KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}
        run: make "servers/${REPRO_KAFKA_VERSION}/kafka-bin"

      - name: Update kafka release cache
        # Only save after a miss: on an exact hit the key already exists and
        # every shard would otherwise attempt a redundant re-save.
        if: steps.cache-servers-dist-restore.outputs.cache-hit != 'true'
        uses: actions/cache/save@v6
        with:
          path: servers/dist
          key: ${{ steps.cache-servers-dist-restore.outputs.cache-primary-key }}

      - name: Start CPU load
        if: github.event.inputs.cpu_load != '0'
        env:
          REPRO_CPU_LOAD: ${{ github.event.inputs.cpu_load }}
        run: |
          # Background stress-ng; capture PID so we can stop it in the
          # cleanup step regardless of whether pytest passed or failed.
          # The 3600s cap matches the job's 60 minute timeout.
          stress-ng --cpu "$REPRO_CPU_LOAD" \
            --timeout 3600s \
            --metrics-brief &
          echo "STRESS_PID=$!" >> "$GITHUB_ENV"

      - name: Run stress test
        id: pytest
        # An explicit `shell: bash` makes the runner start bash with
        # `-eo pipefail`. The implicit default omits pipefail, so the step
        # would take tee's exit status (0), hide pytest failures, and the
        # failure() upload below would never fire.
        shell: bash
        env:
          KAFKA_VERSION: ${{ github.event.inputs.kafka_version }}
          # Inputs travel through the environment rather than being expanded
          # into the script, so quotes or metacharacters in them stay data.
          REPRO_TEST_NODE: ${{ github.event.inputs.test_node }}
          REPRO_COUNT: ${{ github.event.inputs.count }}
          REPRO_PARALLELISM: ${{ github.event.inputs.parallelism }}
        run: |
          # -x stops on first failure so a shard that catches the bug exits
          # fast and uploads logs. --log-cli-level=DEBUG surfaces the debug
          # statements already in the coordinator for task #12.
          pytest "$REPRO_TEST_NODE" \
            --count="$REPRO_COUNT" \
            -n "$REPRO_PARALLELISM" \
            --timeout=120 \
            -x \
            -v \
            --log-cli-level=DEBUG \
            2>&1 | tee pytest.log

      - name: Stop CPU load
        # always(): runs after success or failure; skipped only when the
        # load step never exported a PID (cpu_load == 0).
        if: always() && env.STRESS_PID != ''
        # Best effort: stress-ng may already have exited on its own timeout.
        run: kill "$STRESS_PID" || true

      - name: Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: repro-logs-shard-${{ matrix.shard }}
          path: |
            pytest.log
          if-no-files-found: warn