Skip to content

Commit 9330c7f

Browse files
authored
Shard 5 dual-cluster jobs to speed up Multi-Cluster IT (#17695)
1 parent 81cab40 commit 9330c7f

1 file changed

Lines changed: 110 additions & 5 deletions

File tree

.github/workflows/pipe-it.yml

Lines changed: 110 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ jobs:
119119
name: cluster-log-single-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster1 }}-${{ matrix.cluster2 }}
120120
path: integration-test/target/cluster-logs
121121
retention-days: 30
122+
# 12 IT classes split across 3 parallel shards to cut the historical ~42 min
123+
# wall clock to ~14 min. See cluster-it-1c1d.yml for the shard pattern.
122124
dual-tree-auto-basic:
123125
strategy:
124126
fail-fast: false
@@ -128,6 +130,7 @@ jobs:
128130
# StrongConsistencyClusterMode is skipped for now because RatisConsensus is not supported yet.
129131
cluster: [HighPerformanceMode]
130132
os: [ubuntu-latest]
133+
shard: [0, 1, 2]
131134
runs-on: ${{ matrix.os }}
132135
steps:
133136
- uses: actions/checkout@v5
@@ -147,6 +150,21 @@ jobs:
147150
- name: Sleep for a random duration between 0 and 10000 milliseconds
148151
run: |
149152
sleep $(( $(( RANDOM % 10000 + 1 )) / 1000))
153+
- name: Build IT shard list
154+
shell: bash
155+
# See cluster-it-1c1d.yml for the shard-list pattern. Write under
156+
# $RUNNER_TEMP (outside the repo) so Apache RAT doesn't flag the file.
157+
run: |
158+
set -euo pipefail
159+
SHARD=${{ matrix.shard }}
160+
TOTAL=3
161+
grep -rlE --include='*IT.java' '\bMultiClusterIT2DualTreeAutoBasic\b' integration-test/src/test/java \
162+
| awk -F'/' '{print $NF}' | sed 's/\.java$//' \
163+
| sort \
164+
| awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
165+
> "$RUNNER_TEMP/it-shard.txt"
166+
echo "Shard $SHARD/$TOTAL contains $(wc -l < "$RUNNER_TEMP/it-shard.txt") test classes"
167+
head -5 "$RUNNER_TEMP/it-shard.txt"
150168
- name: IT Test
151169
shell: bash
152170
# We skip compiling client-cpp to save time; it is tested in client.yml.
@@ -164,6 +182,9 @@ jobs:
164182
-DskipUTs \
165183
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
166184
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
185+
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
186+
-DfailIfNoTests=false \
187+
-Dfailsafe.failIfNoSpecifiedTests=false \
167188
-pl integration-test \
168189
-am -PMultiClusterIT2DualTreeAutoBasic \
169190
-ntp >> ~/run-tests-$attempt.log && return 0
@@ -201,9 +222,11 @@ jobs:
201222
if: failure()
202223
uses: actions/upload-artifact@v6
203224
with:
204-
name: cluster-log-dual-tree-auto-basic-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster }}-${{ matrix.cluster }}
225+
name: cluster-log-dual-tree-auto-basic-shard${{ matrix.shard }}-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster }}-${{ matrix.cluster }}
205226
path: integration-test/target/cluster-logs
206227
retention-days: 30
228+
# 9 IT classes split across 3 parallel shards to cut the historical ~51 min
229+
# wall clock to ~17 min. See cluster-it-1c1d.yml for the shard pattern.
207230
dual-tree-auto-enhanced:
208231
strategy:
209232
fail-fast: false
@@ -214,6 +237,7 @@ jobs:
214237
cluster1: [HighPerformanceMode]
215238
cluster2: [HighPerformanceMode]
216239
os: [ubuntu-latest]
240+
shard: [0, 1, 2]
217241
runs-on: ${{ matrix.os }}
218242
steps:
219243
- uses: actions/checkout@v5
@@ -233,6 +257,21 @@ jobs:
233257
- name: Sleep for a random duration between 0 and 10000 milliseconds
234258
run: |
235259
sleep $(( $(( RANDOM % 10000 + 1 )) / 1000))
260+
- name: Build IT shard list
261+
shell: bash
262+
# See cluster-it-1c1d.yml for the shard-list pattern. Write under
263+
# $RUNNER_TEMP (outside the repo) so Apache RAT doesn't flag the file.
264+
run: |
265+
set -euo pipefail
266+
SHARD=${{ matrix.shard }}
267+
TOTAL=3
268+
grep -rlE --include='*IT.java' '\bMultiClusterIT2DualTreeAutoEnhanced\b' integration-test/src/test/java \
269+
| awk -F'/' '{print $NF}' | sed 's/\.java$//' \
270+
| sort \
271+
| awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
272+
> "$RUNNER_TEMP/it-shard.txt"
273+
echo "Shard $SHARD/$TOTAL contains $(wc -l < "$RUNNER_TEMP/it-shard.txt") test classes"
274+
head -5 "$RUNNER_TEMP/it-shard.txt"
236275
- name: IT Test
237276
shell: bash
238277
# We skip compiling client-cpp to save time; it is tested in client.yml.
@@ -250,6 +289,9 @@ jobs:
250289
-DskipUTs \
251290
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
252291
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
292+
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
293+
-DfailIfNoTests=false \
294+
-Dfailsafe.failIfNoSpecifiedTests=false \
253295
-pl integration-test \
254296
-am -PMultiClusterIT2DualTreeAutoEnhanced \
255297
-ntp >> ~/run-tests-$attempt.log && return 0
@@ -287,9 +329,11 @@ jobs:
287329
if: failure()
288330
uses: actions/upload-artifact@v6
289331
with:
290-
name: cluster-log-dual-tree-auto-enhanced-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster1 }}-${{ matrix.cluster2 }}
332+
name: cluster-log-dual-tree-auto-enhanced-shard${{ matrix.shard }}-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster1 }}-${{ matrix.cluster2 }}
291333
path: integration-test/target/cluster-logs
292334
retention-days: 30
335+
# 11 IT classes split across 3 parallel shards to cut the historical ~27 min
336+
# wall clock to ~9 min. See cluster-it-1c1d.yml for the shard pattern.
293337
dual-tree-manual:
294338
strategy:
295339
fail-fast: false
@@ -300,6 +344,7 @@ jobs:
300344
cluster1: [HighPerformanceMode]
301345
cluster2: [HighPerformanceMode]
302346
os: [ubuntu-latest]
347+
shard: [0, 1, 2]
303348
runs-on: ${{ matrix.os }}
304349
steps:
305350
- uses: actions/checkout@v5
@@ -319,6 +364,21 @@ jobs:
319364
- name: Sleep for a random duration between 0 and 10000 milliseconds
320365
run: |
321366
sleep $(( $(( RANDOM % 10000 + 1 )) / 1000))
367+
- name: Build IT shard list
368+
shell: bash
369+
# See cluster-it-1c1d.yml for the shard-list pattern. Write under
370+
# $RUNNER_TEMP (outside the repo) so Apache RAT doesn't flag the file.
371+
run: |
372+
set -euo pipefail
373+
SHARD=${{ matrix.shard }}
374+
TOTAL=3
375+
grep -rlE --include='*IT.java' '\bMultiClusterIT2DualTreeManual\b' integration-test/src/test/java \
376+
| awk -F'/' '{print $NF}' | sed 's/\.java$//' \
377+
| sort \
378+
| awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
379+
> "$RUNNER_TEMP/it-shard.txt"
380+
echo "Shard $SHARD/$TOTAL contains $(wc -l < "$RUNNER_TEMP/it-shard.txt") test classes"
381+
head -5 "$RUNNER_TEMP/it-shard.txt"
322382
- name: IT Test
323383
shell: bash
324384
# We skip compiling client-cpp to save time; it is tested in client.yml.
@@ -336,6 +396,9 @@ jobs:
336396
-DskipUTs \
337397
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
338398
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
399+
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
400+
-DfailIfNoTests=false \
401+
-Dfailsafe.failIfNoSpecifiedTests=false \
339402
-pl integration-test \
340403
-am -PMultiClusterIT2DualTreeManual \
341404
-ntp >> ~/run-tests-$attempt.log && return 0
@@ -373,7 +436,7 @@ jobs:
373436
if: failure()
374437
uses: actions/upload-artifact@v6
375438
with:
376-
name: cluster-log-dual-tree-manual-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster1 }}-${{ matrix.cluster2 }}
439+
name: cluster-log-dual-tree-manual-shard${{ matrix.shard }}-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster1 }}-${{ matrix.cluster2 }}
377440
path: integration-test/target/cluster-logs
378441
retention-days: 30
379442
subscription-tree-arch-verification:
@@ -720,6 +783,8 @@ jobs:
720783
name: cluster-log-subscription-tree-regression-misc-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster1 }}-${{ matrix.cluster2 }}
721784
path: integration-test/target/cluster-logs
722785
retention-days: 30
786+
# 13 IT classes split across 3 parallel shards to cut the historical ~63 min
787+
# wall clock to ~22 min. See cluster-it-1c1d.yml for the shard pattern.
723788
dual-table-manual-basic:
724789
strategy:
725790
fail-fast: false
@@ -729,6 +794,7 @@ jobs:
729794
# StrongConsistencyClusterMode is skipped for now because RatisConsensus is not supported yet.
730795
cluster: [HighPerformanceMode]
731796
os: [ubuntu-latest]
797+
shard: [0, 1, 2]
732798
runs-on: ${{ matrix.os }}
733799
steps:
734800
- uses: actions/checkout@v5
@@ -748,6 +814,21 @@ jobs:
748814
- name: Sleep for a random duration between 0 and 10000 milliseconds
749815
run: |
750816
sleep $(( $(( RANDOM % 10000 + 1 )) / 1000))
817+
- name: Build IT shard list
818+
shell: bash
819+
# See cluster-it-1c1d.yml for the shard-list pattern. Write under
820+
# $RUNNER_TEMP (outside the repo) so Apache RAT doesn't flag the file.
821+
run: |
822+
set -euo pipefail
823+
SHARD=${{ matrix.shard }}
824+
TOTAL=3
825+
grep -rlE --include='*IT.java' '\bMultiClusterIT2DualTableManualBasic\b' integration-test/src/test/java \
826+
| awk -F'/' '{print $NF}' | sed 's/\.java$//' \
827+
| sort \
828+
| awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
829+
> "$RUNNER_TEMP/it-shard.txt"
830+
echo "Shard $SHARD/$TOTAL contains $(wc -l < "$RUNNER_TEMP/it-shard.txt") test classes"
831+
head -5 "$RUNNER_TEMP/it-shard.txt"
751832
- name: IT Test
752833
shell: bash
753834
# We skip compiling client-cpp to save time; it is tested in client.yml.
@@ -765,6 +846,9 @@ jobs:
765846
-DskipUTs \
766847
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
767848
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
849+
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
850+
-DfailIfNoTests=false \
851+
-Dfailsafe.failIfNoSpecifiedTests=false \
768852
-pl integration-test \
769853
-am -PMultiClusterIT2DualTableManualBasic \
770854
-ntp >> ~/run-tests-$attempt.log && return 0
@@ -802,9 +886,11 @@ jobs:
802886
if: failure()
803887
uses: actions/upload-artifact@v6
804888
with:
805-
name: cluster-log-dual-table-manual-basic-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster }}-${{ matrix.cluster }}
889+
name: cluster-log-dual-table-manual-basic-shard${{ matrix.shard }}-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster }}-${{ matrix.cluster }}
806890
path: integration-test/target/cluster-logs
807891
retention-days: 30
892+
# 11 IT classes split across 3 parallel shards to cut the historical ~62 min
893+
# wall clock to ~22 min. See cluster-it-1c1d.yml for the shard pattern.
808894
dual-table-manual-enhanced:
809895
strategy:
810896
fail-fast: false
@@ -814,6 +900,7 @@ jobs:
814900
# StrongConsistencyClusterMode is skipped for now because RatisConsensus is not supported yet.
815901
cluster: [HighPerformanceMode]
816902
os: [ubuntu-latest]
903+
shard: [0, 1, 2]
817904
runs-on: ${{ matrix.os }}
818905
steps:
819906
- uses: actions/checkout@v5
@@ -833,6 +920,21 @@ jobs:
833920
- name: Sleep for a random duration between 0 and 10000 milliseconds
834921
run: |
835922
sleep $(( $(( RANDOM % 10000 + 1 )) / 1000))
923+
- name: Build IT shard list
924+
shell: bash
925+
# See cluster-it-1c1d.yml for the shard-list pattern. Write under
926+
# $RUNNER_TEMP (outside the repo) so Apache RAT doesn't flag the file.
927+
run: |
928+
set -euo pipefail
929+
SHARD=${{ matrix.shard }}
930+
TOTAL=3
931+
grep -rlE --include='*IT.java' '\bMultiClusterIT2DualTableManualEnhanced\b' integration-test/src/test/java \
932+
| awk -F'/' '{print $NF}' | sed 's/\.java$//' \
933+
| sort \
934+
| awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
935+
> "$RUNNER_TEMP/it-shard.txt"
936+
echo "Shard $SHARD/$TOTAL contains $(wc -l < "$RUNNER_TEMP/it-shard.txt") test classes"
937+
head -5 "$RUNNER_TEMP/it-shard.txt"
836938
- name: IT Test
837939
shell: bash
838940
# We skip compiling client-cpp to save time; it is tested in client.yml.
@@ -850,6 +952,9 @@ jobs:
850952
-DskipUTs \
851953
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
852954
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
955+
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
956+
-DfailIfNoTests=false \
957+
-Dfailsafe.failIfNoSpecifiedTests=false \
853958
-pl integration-test \
854959
-am -PMultiClusterIT2DualTableManualEnhanced \
855960
-ntp >> ~/run-tests-$attempt.log && return 0
@@ -887,7 +992,7 @@ jobs:
887992
if: failure()
888993
uses: actions/upload-artifact@v6
889994
with:
890-
name: cluster-log-dual-table-manual-enhanced-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster }}-${{ matrix.cluster }}
995+
name: cluster-log-dual-table-manual-enhanced-shard${{ matrix.shard }}-java${{ matrix.java }}-${{ runner.os }}-${{ matrix.cluster }}-${{ matrix.cluster }}
891996
path: integration-test/target/cluster-logs
892997
retention-days: 30
893998
triple:

0 commit comments

Comments
 (0)